cjkcodecs.h 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525
  1. /*
  2. * cjkcodecs.h: common header for cjkcodecs
  3. *
  4. * Written by Hye-Shik Chang <perky@FreeBSD.org>
  5. */
  6. #ifndef _CJKCODECS_H_
  7. #define _CJKCODECS_H_
  8. #define PY_SSIZE_T_CLEAN
  9. #include "Python.h"
  10. #include "multibytecodec.h"
/* A Unicode "undefined" code point.  _TRYMAP_DEC treats a decode-table
 * entry equal to this value as "no mapping". */
#define UNIINV 0xFFFE

/* Internal-use DBCS code points which aren't used by any charsets.
 *   NOCHAR - _TRYMAP_ENC treats an encode-table entry equal to this value
 *            as "no mapping".
 *   MULTIC - not referenced elsewhere in this header.
 *   DBCINV - returned by find_pairencmap() when no pair matches.
 * MULTIC and UNIINV share the numeric value 0xFFFE; they are told apart
 * only by where they are used (DBCS values versus Unicode values). */
#define NOCHAR 0xFFFF
#define MULTIC 0xFFFE
#define DBCINV 0xFFFD

/* Shorter macros to save source size of mapping tables.  These are very
 * short, global macro names: any later use of U, N, M or D as an
 * identifier is replaced by the preprocessor. */
#define U UNIINV
#define N NOCHAR
#define M MULTIC
#define D DBCINV
  22. struct dbcs_index {
  23. const ucs2_t *map;
  24. unsigned char bottom, top;
  25. };
  26. typedef struct dbcs_index decode_map;
  27. struct widedbcs_index {
  28. const Py_UCS4 *map;
  29. unsigned char bottom, top;
  30. };
  31. typedef struct widedbcs_index widedecode_map;
  32. struct unim_index {
  33. const DBCHAR *map;
  34. unsigned char bottom, top;
  35. };
  36. typedef struct unim_index encode_map;
  37. struct unim_index_bytebased {
  38. const unsigned char *map;
  39. unsigned char bottom, top;
  40. };
/* A named pair of mapping tables for one charset.
 *
 * register_maps() publishes each entry as a capsule attribute called
 * "__map_<charset>" on the module, and importmap() reads encmap/decmap
 * back out of such a capsule.  Either table pointer may be NULL
 * (see MAPPING_ENCONLY / MAPPING_DECONLY). */
struct dbcs_map {
    const char *charset;                /* charset name (attribute suffix) */
    const struct unim_index *encmap;    /* encode table, or NULL */
    const struct dbcs_index *decmap;    /* decode table, or NULL */
};
/* One entry of a pair-encoding table searched by find_pairencmap().
 *
 * find_pairencmap() builds its search key as (body << 16 | modifier) and
 * compares it against `uniseq` while bisecting, so arrays of these are
 * presumably sorted by ascending `uniseq` -- confirm against the tables. */
struct pair_encodemap {
    Py_UCS4 uniseq;     /* key: two 16-bit code units packed together */
    DBCHAR code;        /* value returned on a match */
};
/* Extension hook: if CJK_MOD_SPECIFIC_STATE is already defined at this
 * point, its expansion is pasted into the state struct below as extra
 * members.  By default it expands to nothing. */
#ifndef CJK_MOD_SPECIFIC_STATE
#define CJK_MOD_SPECIFIC_STATE
#endif

/* Per-module state.  It is filled in by add_mappings() / add_codecs()
 * (generated by the BEGIN_*_LIST macros) and released by _cjk_free(). */
typedef struct _cjk_mod_state {
    int num_mappings;               /* number of entries in mapping_list */
    int num_codecs;                 /* number of entries in codec_list */
    struct dbcs_map *mapping_list;  /* PyMem_Calloc'ed array, or NULL */
    MultibyteCodec *codec_list;     /* PyMem_Calloc'ed array, or NULL */

    CJK_MOD_SPECIFIC_STATE
} cjkcodecs_module_state;
  60. static inline cjkcodecs_module_state *
  61. get_module_state(PyObject *mod)
  62. {
  63. void *state = PyModule_GetState(mod);
  64. assert(state != NULL);
  65. return (cjkcodecs_module_state *)state;
  66. }
/* Function-header macros.
 *
 * Each macro expands to the declarator of a static function named
 * <encoding>_<role>; the user writes the function body right after the
 * macro invocation.  The parameter names are fixed by the macros, and
 * the helper macros later in this header (NEXT_*, REQUIRE_*, INBYTE*,
 * INCHAR*, OUTCHAR*, OUTBYTE*, WRITEBYTE*, TRYMAP_*_ST) refer to them
 * directly. */

/* <encoding>_codec_init(codec) -> int */
#define CODEC_INIT(encoding) \
    static int encoding##_codec_init(const MultibyteCodec *codec)

/* <encoding>_encode_init(state, codec) -> int */
#define ENCODER_INIT(encoding) \
    static int encoding##_encode_init( \
        MultibyteCodec_State *state, const MultibyteCodec *codec)

/* <encoding>_encode(...) -> Py_ssize_t
 * The input is read through kind/data/*inpos (see INCHAR1/INCHAR2) and
 * the output is written through *outbuf with `outleft` bytes of room. */
#define ENCODER(encoding) \
    static Py_ssize_t encoding##_encode( \
        MultibyteCodec_State *state, const MultibyteCodec *codec, \
        int kind, const void *data, \
        Py_ssize_t *inpos, Py_ssize_t inlen, \
        unsigned char **outbuf, Py_ssize_t outleft, int flags)

/* <encoding>_encode_reset(state, codec, outbuf, outleft) -> Py_ssize_t */
#define ENCODER_RESET(encoding) \
    static Py_ssize_t encoding##_encode_reset( \
        MultibyteCodec_State *state, const MultibyteCodec *codec, \
        unsigned char **outbuf, Py_ssize_t outleft)

/* <encoding>_decode_init(state, codec) -> int */
#define DECODER_INIT(encoding) \
    static int encoding##_decode_init( \
        MultibyteCodec_State *state, const MultibyteCodec *codec)

/* <encoding>_decode(...) -> Py_ssize_t
 * The input is read through *inbuf with `inleft` bytes available and
 * the output goes to the _PyUnicodeWriter `writer` (see OUTCHAR). */
#define DECODER(encoding) \
    static Py_ssize_t encoding##_decode( \
        MultibyteCodec_State *state, const MultibyteCodec *codec, \
        const unsigned char **inbuf, Py_ssize_t inleft, \
        _PyUnicodeWriter *writer)

/* <encoding>_decode_reset(state, codec) -> Py_ssize_t */
#define DECODER_RESET(encoding) \
    static Py_ssize_t encoding##_decode_reset( \
        MultibyteCodec_State *state, const MultibyteCodec *codec)
/* Cursor helpers.  They use the parameter names introduced by the
 * ENCODER / DECODER macros (inbuf, inleft, inpos, outbuf, outleft), so
 * they only work inside functions that have those names in scope. */

/* Advance the byte input cursor by `i`: moves *inbuf forward and
 * shrinks inleft by the same amount. */
#define NEXT_IN(i) \
    do { \
        (*inbuf) += (i); \
        (inleft) -= (i); \
    } while (0)

/* Advance the character input position *inpos by `i`. */
#define NEXT_INCHAR(i) \
    do { \
        (*inpos) += (i); \
    } while (0)

/* Advance the byte output cursor by `o`: moves *outbuf forward and
 * shrinks outleft by the same amount. */
#define NEXT_OUT(o) \
    do { \
        (*outbuf) += (o); \
        (outleft) -= (o); \
    } while (0)

/* Consume `i` input characters and commit `o` output bytes. */
#define NEXT(i, o) \
    do { \
        NEXT_INCHAR(i); \
        NEXT_OUT(o); \
    } while (0)

/* Return MBERR_TOOFEW from the enclosing function unless at least `n`
 * input bytes remain. */
#define REQUIRE_INBUF(n) \
    do { \
        if (inleft < (n)) \
            return MBERR_TOOFEW; \
    } while (0)

/* Return MBERR_TOOSMALL from the enclosing function unless at least `n`
 * output bytes remain. */
#define REQUIRE_OUTBUF(n) \
    do { \
        if (outleft < (n)) \
            return MBERR_TOOSMALL; \
    } while (0)
/* Peek at the 1st..4th byte at the current input cursor *inbuf.  No
 * bounds check is done here; callers are expected to have checked the
 * available length first (e.g. with REQUIRE_INBUF). */
#define INBYTE1 ((*inbuf)[0])
#define INBYTE2 ((*inbuf)[1])
#define INBYTE3 ((*inbuf)[2])
#define INBYTE4 ((*inbuf)[3])

/* Read the character at the current input position *inpos, and the one
 * after it, from the (kind, data) pair via PyUnicode_READ.  No bounds
 * check is done here either. */
#define INCHAR1 (PyUnicode_READ(kind, data, *inpos))
#define INCHAR2 (PyUnicode_READ(kind, data, *inpos + 1))
/* Append code point `c` to `writer` (the _PyUnicodeWriter parameter of a
 * DECODER function).  If the write fails, returns MBERR_EXCEPTION from
 * the enclosing function. */
#define OUTCHAR(c) \
    do { \
        if (_PyUnicodeWriter_WriteChar(writer, (c)) < 0) \
            return MBERR_EXCEPTION; \
    } while (0)
  133. #define OUTCHAR2(c1, c2) \
  134. do { \
  135. Py_UCS4 _c1 = (c1); \
  136. Py_UCS4 _c2 = (c2); \
  137. if (_PyUnicodeWriter_Prepare(writer, 2, Py_MAX(_c1, c2)) < 0) \
  138. return MBERR_EXCEPTION; \
  139. PyUnicode_WRITE(writer->kind, writer->data, writer->pos, _c1); \
  140. PyUnicode_WRITE(writer->kind, writer->data, writer->pos + 1, _c2); \
  141. writer->pos += 2; \
  142. } while (0)
/* Store byte `c` at offset `i` from the output cursor *outbuf.  The
 * assert checks that `c` fits in an unsigned char.  This does not check
 * the remaining space and does not advance the cursor. */
#define OUTBYTEI(c, i) \
    do { \
        assert((unsigned char)(c) == (c)); \
        ((*outbuf)[i]) = (c); \
    } while (0)

/* Store the 1st..4th output byte relative to *outbuf (no space check). */
#define OUTBYTE1(c) OUTBYTEI(c, 0)
#define OUTBYTE2(c) OUTBYTEI(c, 1)
#define OUTBYTE3(c) OUTBYTEI(c, 2)
#define OUTBYTE4(c) OUTBYTEI(c, 3)

/* WRITEBYTEn: make sure n output bytes are available (returning
 * MBERR_TOOSMALL from the enclosing function otherwise), then store
 * them.  The output cursor is NOT advanced; use NEXT_OUT / NEXT for
 * that. */
#define WRITEBYTE1(c1) \
    do { \
        REQUIRE_OUTBUF(1); \
        OUTBYTE1(c1); \
    } while (0)
#define WRITEBYTE2(c1, c2) \
    do { \
        REQUIRE_OUTBUF(2); \
        OUTBYTE1(c1); \
        OUTBYTE2(c2); \
    } while (0)
#define WRITEBYTE3(c1, c2, c3) \
    do { \
        REQUIRE_OUTBUF(3); \
        OUTBYTE1(c1); \
        OUTBYTE2(c2); \
        OUTBYTE3(c3); \
    } while (0)
#define WRITEBYTE4(c1, c2, c3, c4) \
    do { \
        REQUIRE_OUTBUF(4); \
        OUTBYTE1(c1); \
        OUTBYTE2(c2); \
        OUTBYTE3(c3); \
        OUTBYTE4(c4); \
    } while (0)
/* Probe one encode-table row.  This is an expression, so it can be used
 * directly as an `if` condition.
 *   m    - pointer to the row (needs ->map, ->bottom, ->top)
 *   assi - lvalue that receives the table entry
 *   val  - index within the row
 * It is true only when the row has a map, `val` lies in [bottom, top]
 * and the stored entry is not NOCHAR.  `assi` is assigned only after
 * the NULL and range checks pass; when the entry is NOCHAR the result
 * is false but `assi` has still been overwritten.  `m` and `val` are
 * expanded several times, so pass side-effect-free expressions. */
#define _TRYMAP_ENC(m, assi, val) \
    ((m)->map != NULL && (val) >= (m)->bottom && \
        (val)<= (m)->top && ((assi) = (m)->map[(val) - \
        (m)->bottom]) != NOCHAR)

/* Encode lookup in the table named <charset>_encmap visible at the call
 * site: the row is chosen by (uni >> 8), the index is (uni & 0xff). */
#define TRYMAP_ENC(charset, assi, uni) \
    _TRYMAP_ENC(&charset##_encmap[(uni) >> 8], assi, (uni) & 0xff)

/* Same as TRYMAP_ENC, but the table is read from the module state via
 * codec->modstate-><charset>_encmap (`codec` must be in scope). */
#define TRYMAP_ENC_ST(charset, assi, uni) \
    _TRYMAP_ENC(&(codec->modstate->charset##_encmap)[(uni) >> 8], \
                assi, (uni) & 0xff)

/* Probe one decode-table row.  Same contract as _TRYMAP_ENC, except
 * that the "no mapping" marker is UNIINV. */
#define _TRYMAP_DEC(m, assi, val) \
    ((m)->map != NULL && \
     (val) >= (m)->bottom && \
     (val)<= (m)->top && \
     ((assi) = (m)->map[(val) - (m)->bottom]) != UNIINV)

/* Decode lookup in the table named <charset>_decmap visible at the call
 * site: the row is chosen by c1, the index within the row is c2. */
#define TRYMAP_DEC(charset, assi, c1, c2) \
    _TRYMAP_DEC(&charset##_decmap[c1], assi, c2)

/* Same as TRYMAP_DEC, but the table is read from the module state via
 * codec->modstate-><charset>_decmap (`codec` must be in scope). */
#define TRYMAP_DEC_ST(charset, assi, c1, c2) \
    _TRYMAP_DEC(&(codec->modstate->charset##_decmap)[c1], assi, c2)
  196. #define BEGIN_MAPPINGS_LIST(NUM) \
  197. static int \
  198. add_mappings(cjkcodecs_module_state *st) \
  199. { \
  200. int idx = 0; \
  201. (void)idx; \
  202. st->num_mappings = NUM; \
  203. st->mapping_list = PyMem_Calloc(NUM, sizeof(struct dbcs_map)); \
  204. if (st->mapping_list == NULL) { \
  205. return -1; \
  206. }
  207. #define MAPPING_ENCONLY(enc) \
  208. st->mapping_list[idx++] = (struct dbcs_map){#enc, (void*)enc##_encmap, NULL};
  209. #define MAPPING_DECONLY(enc) \
  210. st->mapping_list[idx++] = (struct dbcs_map){#enc, NULL, (void*)enc##_decmap};
  211. #define MAPPING_ENCDEC(enc) \
  212. st->mapping_list[idx++] = (struct dbcs_map){#enc, (void*)enc##_encmap, (void*)enc##_decmap};
  213. #define END_MAPPINGS_LIST \
  214. assert(st->num_mappings == idx); \
  215. return 0; \
  216. }
  217. #define BEGIN_CODECS_LIST(NUM) \
  218. static int \
  219. add_codecs(cjkcodecs_module_state *st) \
  220. { \
  221. int idx = 0; \
  222. (void)idx; \
  223. st->num_codecs = NUM; \
  224. st->codec_list = PyMem_Calloc(NUM, sizeof(MultibyteCodec)); \
  225. if (st->codec_list == NULL) { \
  226. return -1; \
  227. }
  228. #define _STATEFUL_METHODS(enc) \
  229. enc##_encode, \
  230. enc##_encode_init, \
  231. enc##_encode_reset, \
  232. enc##_decode, \
  233. enc##_decode_init, \
  234. enc##_decode_reset,
  235. #define _STATELESS_METHODS(enc) \
  236. enc##_encode, NULL, NULL, \
  237. enc##_decode, NULL, NULL,
  238. #define NEXT_CODEC \
  239. st->codec_list[idx++]
  240. #define CODEC_STATEFUL(enc) \
  241. NEXT_CODEC = (MultibyteCodec){#enc, NULL, NULL, _STATEFUL_METHODS(enc)};
  242. #define CODEC_STATELESS(enc) \
  243. NEXT_CODEC = (MultibyteCodec){#enc, NULL, NULL, _STATELESS_METHODS(enc)};
  244. #define CODEC_STATELESS_WINIT(enc) \
  245. NEXT_CODEC = (MultibyteCodec){#enc, NULL, enc##_codec_init, _STATELESS_METHODS(enc)};
  246. #define END_CODECS_LIST \
  247. assert(st->num_codecs == idx); \
  248. for (int i = 0; i < st->num_codecs; i++) { \
  249. st->codec_list[i].modstate = st; \
  250. } \
  251. return 0; \
  252. }
  253. static PyObject *
  254. getmultibytecodec(void)
  255. {
  256. return _PyImport_GetModuleAttrString("_multibytecodec", "__create_codec");
  257. }
  258. static void
  259. destroy_codec_capsule(PyObject *capsule)
  260. {
  261. void *ptr = PyCapsule_GetPointer(capsule, CODEC_CAPSULE);
  262. codec_capsule *data = (codec_capsule *)ptr;
  263. Py_DECREF(data->cjk_module);
  264. PyMem_Free(ptr);
  265. }
  266. static codec_capsule *
  267. capsulate_codec(PyObject *mod, const MultibyteCodec *codec)
  268. {
  269. codec_capsule *data = PyMem_Malloc(sizeof(codec_capsule));
  270. if (data == NULL) {
  271. PyErr_NoMemory();
  272. return NULL;
  273. }
  274. data->codec = codec;
  275. data->cjk_module = Py_NewRef(mod);
  276. return data;
  277. }
  278. static PyObject *
  279. _getcodec(PyObject *self, const MultibyteCodec *codec)
  280. {
  281. PyObject *cofunc = getmultibytecodec();
  282. if (cofunc == NULL) {
  283. return NULL;
  284. }
  285. codec_capsule *data = capsulate_codec(self, codec);
  286. if (data == NULL) {
  287. Py_DECREF(cofunc);
  288. return NULL;
  289. }
  290. PyObject *codecobj = PyCapsule_New(data, CODEC_CAPSULE,
  291. destroy_codec_capsule);
  292. if (codecobj == NULL) {
  293. PyMem_Free(data);
  294. Py_DECREF(cofunc);
  295. return NULL;
  296. }
  297. PyObject *res = PyObject_CallOneArg(cofunc, codecobj);
  298. Py_DECREF(codecobj);
  299. Py_DECREF(cofunc);
  300. return res;
  301. }
  302. static PyObject *
  303. getcodec(PyObject *self, PyObject *encoding)
  304. {
  305. if (!PyUnicode_Check(encoding)) {
  306. PyErr_SetString(PyExc_TypeError,
  307. "encoding name must be a string.");
  308. return NULL;
  309. }
  310. const char *enc = PyUnicode_AsUTF8(encoding);
  311. if (enc == NULL) {
  312. return NULL;
  313. }
  314. cjkcodecs_module_state *st = get_module_state(self);
  315. for (int i = 0; i < st->num_codecs; i++) {
  316. const MultibyteCodec *codec = &st->codec_list[i];
  317. if (strcmp(codec->encoding, enc) == 0) {
  318. return _getcodec(self, codec);
  319. }
  320. }
  321. PyErr_SetString(PyExc_LookupError,
  322. "no such codec is supported.");
  323. return NULL;
  324. }
  325. static int add_mappings(cjkcodecs_module_state *);
  326. static int add_codecs(cjkcodecs_module_state *);
  327. static int
  328. register_maps(PyObject *module)
  329. {
  330. // Init module state.
  331. cjkcodecs_module_state *st = get_module_state(module);
  332. if (add_mappings(st) < 0) {
  333. return -1;
  334. }
  335. if (add_codecs(st) < 0) {
  336. return -1;
  337. }
  338. for (int i = 0; i < st->num_mappings; i++) {
  339. const struct dbcs_map *h = &st->mapping_list[i];
  340. char mhname[256] = "__map_";
  341. strcpy(mhname + sizeof("__map_") - 1, h->charset);
  342. PyObject *capsule = PyCapsule_New((void *)h, MAP_CAPSULE, NULL);
  343. if (capsule == NULL) {
  344. return -1;
  345. }
  346. if (PyModule_AddObject(module, mhname, capsule) < 0) {
  347. Py_DECREF(capsule);
  348. return -1;
  349. }
  350. }
  351. return 0;
  352. }
  353. #ifdef USING_BINARY_PAIR_SEARCH
  354. static DBCHAR
  355. find_pairencmap(ucs2_t body, ucs2_t modifier,
  356. const struct pair_encodemap *haystack, int haystacksize)
  357. {
  358. int pos, min, max;
  359. Py_UCS4 value = body << 16 | modifier;
  360. min = 0;
  361. max = haystacksize;
  362. for (pos = haystacksize >> 1; min != max; pos = (min + max) >> 1) {
  363. if (value < haystack[pos].uniseq) {
  364. if (max != pos) {
  365. max = pos;
  366. continue;
  367. }
  368. }
  369. else if (value > haystack[pos].uniseq) {
  370. if (min != pos) {
  371. min = pos;
  372. continue;
  373. }
  374. }
  375. break;
  376. }
  377. if (value == haystack[pos].uniseq) {
  378. return haystack[pos].code;
  379. }
  380. return DBCINV;
  381. }
  382. #endif
  383. #ifdef USING_IMPORTED_MAPS
/* Convenience wrapper around importmap(): fetches the attribute
 * "__map_<charset>" from the module "_codecs_<locale>" and stores its
 * tables through `encmap` / `decmap` (either may be NULL to skip that
 * table).  Evaluates to importmap()'s result: 0 on success, -1 on
 * error. */
#define IMPORT_MAP(locale, charset, encmap, decmap) \
    importmap("_codecs_" #locale, "__map_" #charset, \
              (const void**)encmap, (const void**)decmap)
  387. static int
  388. importmap(const char *modname, const char *symbol,
  389. const void **encmap, const void **decmap)
  390. {
  391. PyObject *o, *mod;
  392. mod = PyImport_ImportModule(modname);
  393. if (mod == NULL)
  394. return -1;
  395. o = PyObject_GetAttrString(mod, symbol);
  396. if (o == NULL)
  397. goto errorexit;
  398. else if (!PyCapsule_IsValid(o, MAP_CAPSULE)) {
  399. PyErr_SetString(PyExc_ValueError,
  400. "map data must be a Capsule.");
  401. goto errorexit;
  402. }
  403. else {
  404. struct dbcs_map *map;
  405. map = PyCapsule_GetPointer(o, MAP_CAPSULE);
  406. if (encmap != NULL)
  407. *encmap = map->encmap;
  408. if (decmap != NULL)
  409. *decmap = map->decmap;
  410. Py_DECREF(o);
  411. }
  412. Py_DECREF(mod);
  413. return 0;
  414. errorexit:
  415. Py_DECREF(mod);
  416. return -1;
  417. }
  418. #endif
  419. static int
  420. _cjk_exec(PyObject *module)
  421. {
  422. return register_maps(module);
  423. }
  424. static void
  425. _cjk_free(void *mod)
  426. {
  427. cjkcodecs_module_state *st = get_module_state((PyObject *)mod);
  428. PyMem_Free(st->mapping_list);
  429. PyMem_Free(st->codec_list);
  430. }
/* Method table shared by every module built from this header: a single
 * function, getcodec(encoding), taking one positional argument (METH_O)
 * and having an empty docstring.  The all-NULL entry ends the table. */
static struct PyMethodDef _cjk_methods[] = {
    {"getcodec", (PyCFunction)getcodec, METH_O, ""},
    {NULL, NULL},
};
/* Module slots: run _cjk_exec() as the Py_mod_exec step, and set the
 * Py_mod_multiple_interpreters slot to
 * Py_MOD_PER_INTERPRETER_GIL_SUPPORTED.  The {0, NULL} entry ends the
 * table. */
static PyModuleDef_Slot _cjk_slots[] = {
    {Py_mod_exec, _cjk_exec},
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
    {0, NULL}
};
/* Emit the module definition and init function for the module
 * "_codecs_<loc>".
 *
 * Expands to a static PyModuleDef named _cjk_module (state size
 * sizeof(cjkcodecs_module_state), methods _cjk_methods, slots
 * _cjk_slots, m_free _cjk_free) and to the exported function
 * PyInit__codecs_<loc>(), which returns PyModuleDef_Init(&_cjk_module).
 * Because the PyModuleDef has a fixed name, the macro can be used only
 * once per translation unit. */
#define I_AM_A_MODULE_FOR(loc)                                          \
    static struct PyModuleDef _cjk_module = {                           \
        PyModuleDef_HEAD_INIT,                                          \
        .m_name = "_codecs_"#loc,                                       \
        .m_size = sizeof(cjkcodecs_module_state),                       \
        .m_methods = _cjk_methods,                                      \
        .m_slots = _cjk_slots,                                          \
        .m_free = _cjk_free,                                            \
    };                                                                  \
                                                                        \
    PyMODINIT_FUNC                                                      \
    PyInit__codecs_##loc(void)                                          \
    {                                                                   \
        return PyModuleDef_Init(&_cjk_module);                          \
    }
  455. #endif