123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525 |
- /*
- * cjkcodecs.h: common header for cjkcodecs
- *
- * Written by Hye-Shik Chang <perky@FreeBSD.org>
- */
- #ifndef _CJKCODECS_H_
- #define _CJKCODECS_H_
- #define PY_SSIZE_T_CLEAN
- #include "Python.h"
- #include "multibytecodec.h"
/* Marker for "no Unicode code point" (compared against in _TRYMAP_DEC). */
#define UNIINV  0xFFFE

/*
 * Internal DBCS marker values that no charset uses for a real character.
 * NOCHAR is the "no mapping" marker tested by _TRYMAP_ENC; DBCINV is what
 * find_pairencmap() returns when nothing matches.
 */
#define NOCHAR  0xFFFF
#define MULTIC  0xFFFE
#define DBCINV  0xFFFD

/* Single-letter aliases that keep the mapping-table sources short. */
#define U       UNIINV
#define N       NOCHAR
#define M       MULTIC
#define D       DBCINV
- struct dbcs_index {
- const ucs2_t *map;
- unsigned char bottom, top;
- };
- typedef struct dbcs_index decode_map;
- struct widedbcs_index {
- const Py_UCS4 *map;
- unsigned char bottom, top;
- };
- typedef struct widedbcs_index widedecode_map;
- struct unim_index {
- const DBCHAR *map;
- unsigned char bottom, top;
- };
- typedef struct unim_index encode_map;
/* Table row with the same shape as unim_index but unsigned char entries. */
struct unim_index_bytebased {
    const unsigned char *map;
    unsigned char bottom;
    unsigned char top;
};
/*
 * A named charset together with its encode and decode tables.  Either
 * table pointer may be NULL (see MAPPING_ENCONLY / MAPPING_DECONLY).
 * Member order matters: the MAPPING_* macros initialise it positionally.
 */
struct dbcs_map {
    const char *charset;                /* charset name */
    const struct unim_index *encmap;    /* encode table, or NULL */
    const struct dbcs_index *decmap;    /* decode table, or NULL */
};
- struct pair_encodemap {
- Py_UCS4 uniseq;
- DBCHAR code;
- };
- #ifndef CJK_MOD_SPECIFIC_STATE
- #define CJK_MOD_SPECIFIC_STATE
- #endif
- typedef struct _cjk_mod_state {
- int num_mappings;
- int num_codecs;
- struct dbcs_map *mapping_list;
- MultibyteCodec *codec_list;
- CJK_MOD_SPECIFIC_STATE
- } cjkcodecs_module_state;
- static inline cjkcodecs_module_state *
- get_module_state(PyObject *mod)
- {
- void *state = PyModule_GetState(mod);
- assert(state != NULL);
- return (cjkcodecs_module_state *)state;
- }
/*
 * Function-header helpers.  Each macro expands to the header of the static
 * function `<name>_<role>` (no body).  The parameter names they introduce
 * (state, codec, kind, data, inpos, inlen, outbuf, outleft, flags, inbuf,
 * inleft, writer) are the names the helper macros in this header rely on.
 */

/* int <name>_codec_init(codec) */
#define CODEC_INIT(name)                                                \
    static int name##_codec_init(const MultibyteCodec *codec)

/* int <name>_encode_init(state, codec) */
#define ENCODER_INIT(name)                                              \
    static int name##_encode_init(                                      \
        MultibyteCodec_State *state, const MultibyteCodec *codec)

/* Py_ssize_t <name>_encode(state, codec, kind, data, inpos, inlen,
                            outbuf, outleft, flags) */
#define ENCODER(name)                                                   \
    static Py_ssize_t name##_encode(                                    \
        MultibyteCodec_State *state, const MultibyteCodec *codec,       \
        int kind, const void *data,                                     \
        Py_ssize_t *inpos, Py_ssize_t inlen,                            \
        unsigned char **outbuf, Py_ssize_t outleft, int flags)

/* Py_ssize_t <name>_encode_reset(state, codec, outbuf, outleft) */
#define ENCODER_RESET(name)                                             \
    static Py_ssize_t name##_encode_reset(                              \
        MultibyteCodec_State *state, const MultibyteCodec *codec,       \
        unsigned char **outbuf, Py_ssize_t outleft)

/* int <name>_decode_init(state, codec) */
#define DECODER_INIT(name)                                              \
    static int name##_decode_init(                                      \
        MultibyteCodec_State *state, const MultibyteCodec *codec)

/* Py_ssize_t <name>_decode(state, codec, inbuf, inleft, writer) */
#define DECODER(name)                                                   \
    static Py_ssize_t name##_decode(                                    \
        MultibyteCodec_State *state, const MultibyteCodec *codec,       \
        const unsigned char **inbuf, Py_ssize_t inleft,                 \
        _PyUnicodeWriter *writer)

/* Py_ssize_t <name>_decode_reset(state, codec) */
#define DECODER_RESET(name)                                             \
    static Py_ssize_t name##_decode_reset(                              \
        MultibyteCodec_State *state, const MultibyteCodec *codec)
/* Decoder side: consume `nbytes` input bytes (moves *inbuf, shrinks inleft). */
#define NEXT_IN(nbytes)                 \
    do {                                \
        (*inbuf) += (nbytes);           \
        (inleft) -= (nbytes);           \
    } while (0)

/* Encoder side: advance the input position *inpos by `nchars`. */
#define NEXT_INCHAR(nchars)             \
    do {                                \
        (*inpos) += (nchars);           \
    } while (0)

/* Encoder side: account for `nbytes` output bytes (moves *outbuf, shrinks
   outleft). */
#define NEXT_OUT(nbytes)                \
    do {                                \
        (*outbuf) += (nbytes);          \
        (outleft) -= (nbytes);          \
    } while (0)

/* Encoder side: NEXT_INCHAR(nchars) followed by NEXT_OUT(nbytes). */
#define NEXT(nchars, nbytes)            \
    do {                                \
        NEXT_INCHAR(nchars);            \
        NEXT_OUT(nbytes);               \
    } while (0)
/* Return MBERR_TOOFEW from the enclosing function unless at least `need`
   input bytes remain (inleft). */
#define REQUIRE_INBUF(need)             \
    do {                                \
        if (inleft < (need)) {          \
            return MBERR_TOOFEW;        \
        }                               \
    } while (0)

/* Return MBERR_TOOSMALL from the enclosing function unless at least `need`
   output bytes remain (outleft). */
#define REQUIRE_OUTBUF(need)            \
    do {                                \
        if (outleft < (need)) {         \
            return MBERR_TOOSMALL;      \
        }                               \
    } while (0)
/* Decoder side: the first four bytes at the current input pointer *inbuf. */
#define INBYTE1     ((*inbuf)[0])
#define INBYTE2     ((*inbuf)[1])
#define INBYTE3     ((*inbuf)[2])
#define INBYTE4     ((*inbuf)[3])

/* Encoder side: the character at *inpos in (kind, data), and the one after. */
#define INCHAR1     (PyUnicode_READ(kind, data, *inpos))
#define INCHAR2     (PyUnicode_READ(kind, data, *inpos + 1))
/* Decoder side: append one character to `writer`; on failure return
   MBERR_EXCEPTION from the enclosing function. */
#define OUTCHAR(ch)                                             \
    do {                                                        \
        if (_PyUnicodeWriter_WriteChar(writer, (ch)) < 0) {     \
            return MBERR_EXCEPTION;                             \
        }                                                       \
    } while (0)
/*
 * Decoder side: append the two characters c1, c2 to `writer`.
 *
 * Each argument is evaluated exactly once (captured in _c1/_c2), so
 * arguments with side effects are safe.  The writer is prepared for two
 * more characters whose maximum is max(_c1, _c2); if that fails the
 * enclosing function returns MBERR_EXCEPTION.  On success both characters
 * are stored at writer->pos and writer->pos is advanced by 2.
 */
#define OUTCHAR2(c1, c2)                                                    \
    do {                                                                    \
        Py_UCS4 _c1 = (c1);                                                 \
        Py_UCS4 _c2 = (c2);                                                 \
        if (_PyUnicodeWriter_Prepare(writer, 2, Py_MAX(_c1, _c2)) < 0) {    \
            return MBERR_EXCEPTION;                                         \
        }                                                                   \
        PyUnicode_WRITE(writer->kind, writer->data, writer->pos, _c1);      \
        PyUnicode_WRITE(writer->kind, writer->data, writer->pos + 1, _c2);  \
        writer->pos += 2;                                                   \
    } while (0)
/* Encoder side: store `byte` at offset `at` from *outbuf.  No bounds check
   and no pointer advance; asserts that the value fits in one byte. */
#define OUTBYTEI(byte, at)                              \
    do {                                                \
        assert((unsigned char)(byte) == (byte));        \
        (*outbuf)[(at)] = (byte);                       \
    } while (0)

/* Fixed-offset shorthands for the first four output bytes. */
#define OUTBYTE1(byte)  OUTBYTEI(byte, 0)
#define OUTBYTE2(byte)  OUTBYTEI(byte, 1)
#define OUTBYTE3(byte)  OUTBYTEI(byte, 2)
#define OUTBYTE4(byte)  OUTBYTEI(byte, 3)
/*
 * Encoder side: WRITEBYTEn(...) first checks that n output bytes are
 * available (REQUIRE_OUTBUF returns from the enclosing function if not) and
 * then stores the n bytes at *outbuf.  The output pointer is not advanced.
 */
#define WRITEBYTE1(b1)                  \
    do {                                \
        REQUIRE_OUTBUF(1);              \
        OUTBYTE1(b1);                   \
    } while (0)

#define WRITEBYTE2(b1, b2)              \
    do {                                \
        REQUIRE_OUTBUF(2);              \
        OUTBYTE1(b1);                   \
        OUTBYTE2(b2);                   \
    } while (0)

#define WRITEBYTE3(b1, b2, b3)          \
    do {                                \
        REQUIRE_OUTBUF(3);              \
        OUTBYTE1(b1);                   \
        OUTBYTE2(b2);                   \
        OUTBYTE3(b3);                   \
    } while (0)

#define WRITEBYTE4(b1, b2, b3, b4)      \
    do {                                \
        REQUIRE_OUTBUF(4);              \
        OUTBYTE1(b1);                   \
        OUTBYTE2(b2);                   \
        OUTBYTE3(b3);                   \
        OUTBYTE4(b4);                   \
    } while (0)
/*
 * Encode-table lookup.  True when row `row` has a table, `key` lies within
 * [bottom, top], and the entry is not NOCHAR.  The entry is stored in `out`
 * as soon as the range test passes, even if it turns out to be NOCHAR.
 * Arguments are evaluated more than once.
 */
#define _TRYMAP_ENC(row, out, key)                                      \
    ((row)->map != NULL                                                 \
     && (key) >= (row)->bottom                                          \
     && (key) <= (row)->top                                             \
     && ((out) = (row)->map[(key) - (row)->bottom]) != NOCHAR)

/* Look up `uni` in the file-scope table <cs>_encmap: the high bits select
   the row, the low 8 bits the entry. */
#define TRYMAP_ENC(cs, out, uni)                                        \
    _TRYMAP_ENC(&cs##_encmap[(uni) >> 8], out, (uni) & 0xff)

/* Same lookup, using the table stored at codec->modstate-><cs>_encmap. */
#define TRYMAP_ENC_ST(cs, out, uni)                                     \
    _TRYMAP_ENC(&(codec->modstate->cs##_encmap)[(uni) >> 8],            \
                out, (uni) & 0xff)
/*
 * Decode-table lookup.  True when row `row` has a table, `key` lies within
 * [bottom, top], and the entry is not UNIINV.  The entry is stored in `out`
 * as soon as the range test passes, even if it turns out to be UNIINV.
 * Arguments are evaluated more than once.
 */
#define _TRYMAP_DEC(row, out, key)                                      \
    ((row)->map != NULL                                                 \
     && (key) >= (row)->bottom                                          \
     && (key) <= (row)->top                                             \
     && ((out) = (row)->map[(key) - (row)->bottom]) != UNIINV)

/* Look up (b1, b2) in the file-scope table <cs>_decmap: b1 selects the
   row, b2 the entry. */
#define TRYMAP_DEC(cs, out, b1, b2)                                     \
    _TRYMAP_DEC(&cs##_decmap[b1], out, b2)

/* Same lookup, using the table stored at codec->modstate-><cs>_decmap. */
#define TRYMAP_DEC_ST(cs, out, b1, b2)                                  \
    _TRYMAP_DEC(&(codec->modstate->cs##_decmap)[b1], out, b2)
/*
 * BEGIN_MAPPINGS_LIST(NUM) opens the definition of
 *
 *     static int add_mappings(cjkcodecs_module_state *st)
 *
 * It records NUM in st->num_mappings and allocates a zeroed
 * st->mapping_list with NUM entries.  The MAPPING_* macros that follow fill
 * the entries through the local `idx`, and END_MAPPINGS_LIST closes the
 * function.
 *
 * If the allocation fails the function raises MemoryError and returns -1.
 * PyMem_Calloc() does not set an exception itself, so it is set here;
 * otherwise the caller would report failure with no exception pending.
 */
#define BEGIN_MAPPINGS_LIST(NUM)                                    \
static int                                                          \
add_mappings(cjkcodecs_module_state *st)                            \
{                                                                   \
    int idx = 0;                                                    \
    (void)idx;                                                      \
    st->num_mappings = NUM;                                         \
    st->mapping_list = PyMem_Calloc(NUM, sizeof(struct dbcs_map));  \
    if (st->mapping_list == NULL) {                                 \
        PyErr_NoMemory();                                           \
        return -1;                                                  \
    }
/*
 * Entries of the mappings list.  Each macro stores one struct dbcs_map in
 * the next free slot of st->mapping_list (advancing `idx`).  The charset
 * name is the stringified argument; the tables are <cs>_encmap and/or
 * <cs>_decmap.  Each expansion already ends in ';'.
 */
#define MAPPING_ENCONLY(cs)                                         \
    st->mapping_list[idx++] = (struct dbcs_map){                    \
        .charset = #cs,                                             \
        .encmap = (void *)cs##_encmap,                              \
        .decmap = NULL                                              \
    };

#define MAPPING_DECONLY(cs)                                         \
    st->mapping_list[idx++] = (struct dbcs_map){                    \
        .charset = #cs,                                             \
        .encmap = NULL,                                             \
        .decmap = (void *)cs##_decmap                               \
    };

#define MAPPING_ENCDEC(cs)                                          \
    st->mapping_list[idx++] = (struct dbcs_map){                    \
        .charset = #cs,                                             \
        .encmap = (void *)cs##_encmap,                              \
        .decmap = (void *)cs##_decmap                               \
    };

/* Closes add_mappings(): asserts every announced slot was filled and
   returns 0. */
#define END_MAPPINGS_LIST                                           \
    assert(idx == st->num_mappings);                                \
    return 0;                                                       \
}
/*
 * BEGIN_CODECS_LIST(NUM) opens the definition of
 *
 *     static int add_codecs(cjkcodecs_module_state *st)
 *
 * It records NUM in st->num_codecs and allocates a zeroed st->codec_list
 * with NUM entries.  The CODEC_* macros that follow fill the entries
 * through the local `idx`, and END_CODECS_LIST closes the function.
 *
 * If the allocation fails the function raises MemoryError and returns -1.
 * PyMem_Calloc() does not set an exception itself, so it is set here;
 * otherwise the caller would report failure with no exception pending.
 */
#define BEGIN_CODECS_LIST(NUM)                                      \
static int                                                          \
add_codecs(cjkcodecs_module_state *st)                              \
{                                                                   \
    int idx = 0;                                                    \
    (void)idx;                                                      \
    st->num_codecs = NUM;                                           \
    st->codec_list = PyMem_Calloc(NUM, sizeof(MultibyteCodec));     \
    if (st->codec_list == NULL) {                                   \
        PyErr_NoMemory();                                           \
        return -1;                                                  \
    }
/*
 * Initialiser fragments for the six method slots of a MultibyteCodec.
 * The trailing comma is intentional: the fragment is pasted inside a
 * brace-enclosed initialiser.
 */
#define _STATEFUL_METHODS(cs)       \
    cs##_encode,                    \
    cs##_encode_init,               \
    cs##_encode_reset,              \
    cs##_decode,                    \
    cs##_decode_init,               \
    cs##_decode_reset,

/* Stateless codecs supply only the encode and decode functions. */
#define _STATELESS_METHODS(cs)      \
    cs##_encode, NULL, NULL,        \
    cs##_decode, NULL, NULL,

/* Next free slot of st->codec_list (advances `idx`). */
#define NEXT_CODEC                  \
    st->codec_list[idx++]

/*
 * Entries of the codecs list.  The codec name is the stringified argument.
 * Only the _WINIT variant installs <cs>_codec_init; the others leave that
 * slot NULL.  Each expansion already ends in ';'.
 */
#define CODEC_STATEFUL(cs)                                              \
    NEXT_CODEC = (MultibyteCodec){                                      \
        #cs, NULL, NULL, _STATEFUL_METHODS(cs)                          \
    };

#define CODEC_STATELESS(cs)                                             \
    NEXT_CODEC = (MultibyteCodec){                                      \
        #cs, NULL, NULL, _STATELESS_METHODS(cs)                         \
    };

#define CODEC_STATELESS_WINIT(cs)                                       \
    NEXT_CODEC = (MultibyteCodec){                                      \
        #cs, NULL, cs##_codec_init, _STATELESS_METHODS(cs)              \
    };

/* Closes add_codecs(): asserts every announced slot was filled, points each
   codec back at the module state, and returns 0. */
#define END_CODECS_LIST                                                 \
    assert(idx == st->num_codecs);                                      \
    for (int slot = 0; slot < st->num_codecs; slot++) {                 \
        st->codec_list[slot].modstate = st;                             \
    }                                                                   \
    return 0;                                                           \
}
- static PyObject *
- getmultibytecodec(void)
- {
- return _PyImport_GetModuleAttrString("_multibytecodec", "__create_codec");
- }
- static void
- destroy_codec_capsule(PyObject *capsule)
- {
- void *ptr = PyCapsule_GetPointer(capsule, CODEC_CAPSULE);
- codec_capsule *data = (codec_capsule *)ptr;
- Py_DECREF(data->cjk_module);
- PyMem_Free(ptr);
- }
- static codec_capsule *
- capsulate_codec(PyObject *mod, const MultibyteCodec *codec)
- {
- codec_capsule *data = PyMem_Malloc(sizeof(codec_capsule));
- if (data == NULL) {
- PyErr_NoMemory();
- return NULL;
- }
- data->codec = codec;
- data->cjk_module = Py_NewRef(mod);
- return data;
- }
- static PyObject *
- _getcodec(PyObject *self, const MultibyteCodec *codec)
- {
- PyObject *cofunc = getmultibytecodec();
- if (cofunc == NULL) {
- return NULL;
- }
- codec_capsule *data = capsulate_codec(self, codec);
- if (data == NULL) {
- Py_DECREF(cofunc);
- return NULL;
- }
- PyObject *codecobj = PyCapsule_New(data, CODEC_CAPSULE,
- destroy_codec_capsule);
- if (codecobj == NULL) {
- PyMem_Free(data);
- Py_DECREF(cofunc);
- return NULL;
- }
- PyObject *res = PyObject_CallOneArg(cofunc, codecobj);
- Py_DECREF(codecobj);
- Py_DECREF(cofunc);
- return res;
- }
- static PyObject *
- getcodec(PyObject *self, PyObject *encoding)
- {
- if (!PyUnicode_Check(encoding)) {
- PyErr_SetString(PyExc_TypeError,
- "encoding name must be a string.");
- return NULL;
- }
- const char *enc = PyUnicode_AsUTF8(encoding);
- if (enc == NULL) {
- return NULL;
- }
- cjkcodecs_module_state *st = get_module_state(self);
- for (int i = 0; i < st->num_codecs; i++) {
- const MultibyteCodec *codec = &st->codec_list[i];
- if (strcmp(codec->encoding, enc) == 0) {
- return _getcodec(self, codec);
- }
- }
- PyErr_SetString(PyExc_LookupError,
- "no such codec is supported.");
- return NULL;
- }
- static int add_mappings(cjkcodecs_module_state *);
- static int add_codecs(cjkcodecs_module_state *);
- static int
- register_maps(PyObject *module)
- {
- // Init module state.
- cjkcodecs_module_state *st = get_module_state(module);
- if (add_mappings(st) < 0) {
- return -1;
- }
- if (add_codecs(st) < 0) {
- return -1;
- }
- for (int i = 0; i < st->num_mappings; i++) {
- const struct dbcs_map *h = &st->mapping_list[i];
- char mhname[256] = "__map_";
- strcpy(mhname + sizeof("__map_") - 1, h->charset);
- PyObject *capsule = PyCapsule_New((void *)h, MAP_CAPSULE, NULL);
- if (capsule == NULL) {
- return -1;
- }
- if (PyModule_AddObject(module, mhname, capsule) < 0) {
- Py_DECREF(capsule);
- return -1;
- }
- }
- return 0;
- }
- #ifdef USING_BINARY_PAIR_SEARCH
- static DBCHAR
- find_pairencmap(ucs2_t body, ucs2_t modifier,
- const struct pair_encodemap *haystack, int haystacksize)
- {
- int pos, min, max;
- Py_UCS4 value = body << 16 | modifier;
- min = 0;
- max = haystacksize;
- for (pos = haystacksize >> 1; min != max; pos = (min + max) >> 1) {
- if (value < haystack[pos].uniseq) {
- if (max != pos) {
- max = pos;
- continue;
- }
- }
- else if (value > haystack[pos].uniseq) {
- if (min != pos) {
- min = pos;
- continue;
- }
- }
- break;
- }
- if (value == haystack[pos].uniseq) {
- return haystack[pos].code;
- }
- return DBCINV;
- }
- #endif
- #ifdef USING_IMPORTED_MAPS
/*
 * Convenience wrapper around importmap(): imports attribute
 * "__map_<cs>" from module "_codecs_<loc>" and stores its tables through
 * the `enc` / `dec` pointers (either may be NULL).
 */
#define IMPORT_MAP(loc, cs, enc, dec)                               \
    importmap("_codecs_" #loc, "__map_" #cs,                        \
              (const void **)enc, (const void **)dec)
- static int
- importmap(const char *modname, const char *symbol,
- const void **encmap, const void **decmap)
- {
- PyObject *o, *mod;
- mod = PyImport_ImportModule(modname);
- if (mod == NULL)
- return -1;
- o = PyObject_GetAttrString(mod, symbol);
- if (o == NULL)
- goto errorexit;
- else if (!PyCapsule_IsValid(o, MAP_CAPSULE)) {
- PyErr_SetString(PyExc_ValueError,
- "map data must be a Capsule.");
- goto errorexit;
- }
- else {
- struct dbcs_map *map;
- map = PyCapsule_GetPointer(o, MAP_CAPSULE);
- if (encmap != NULL)
- *encmap = map->encmap;
- if (decmap != NULL)
- *decmap = map->decmap;
- Py_DECREF(o);
- }
- Py_DECREF(mod);
- return 0;
- errorexit:
- Py_DECREF(mod);
- return -1;
- }
- #endif
- static int
- _cjk_exec(PyObject *module)
- {
- return register_maps(module);
- }
- static void
- _cjk_free(void *mod)
- {
- cjkcodecs_module_state *st = get_module_state((PyObject *)mod);
- PyMem_Free(st->mapping_list);
- PyMem_Free(st->codec_list);
- }
- static struct PyMethodDef _cjk_methods[] = {
- {"getcodec", (PyCFunction)getcodec, METH_O, ""},
- {NULL, NULL},
- };
- static PyModuleDef_Slot _cjk_slots[] = {
- {Py_mod_exec, _cjk_exec},
- {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
- {0, NULL}
- };
/*
 * I_AM_A_MODULE_FOR(region) defines the module definition for the
 * extension module "_codecs_<region>" and its init function
 * PyInit__codecs_<region>(), which returns PyModuleDef_Init() on that
 * definition.
 */
#define I_AM_A_MODULE_FOR(region)                                   \
    static struct PyModuleDef _cjk_module = {                       \
        PyModuleDef_HEAD_INIT,                                      \
        .m_name = "_codecs_" #region,                               \
        .m_size = sizeof(cjkcodecs_module_state),                   \
        .m_methods = _cjk_methods,                                  \
        .m_slots = _cjk_slots,                                      \
        .m_free = _cjk_free,                                        \
    };                                                              \
                                                                    \
    PyMODINIT_FUNC                                                  \
    PyInit__codecs_##region(void)                                   \
    {                                                               \
        return PyModuleDef_Init(&_cjk_module);                      \
    }
- #endif
|