12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544 |
- /* ------------------------------------------------------------------------
- unicodedata -- Provides access to the Unicode database.
- The current version number is reported in the unidata_version constant.
- Written by Marc-Andre Lemburg (mal@lemburg.com).
- Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
- Modified by Martin v. Löwis (martin@v.loewis.de)
- Copyright (c) Corporation for National Research Initiatives.
- ------------------------------------------------------------------------ */
- #ifndef Py_BUILD_CORE_BUILTIN
- # define Py_BUILD_CORE_MODULE 1
- #endif
- #define PY_SSIZE_T_CLEAN
- #include "Python.h"
- #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
- #include "structmember.h" // PyMemberDef
- #include <stdbool.h>
- /*[clinic input]
- module unicodedata
- class unicodedata.UCD 'PreviousDBVersion *' '<not used>'
- [clinic start generated code]*/
- /*[clinic end generated code: output=da39a3ee5e6b4b0d input=e47113e05924be43]*/
/* Character properties: one record of _PyUnicode_Database_Records, as
   returned by _getrecord_ex(). */
typedef struct {
    /* General category, as an index into _PyUnicode_CategoryNames. */
    const unsigned char category;
    /* Combining class value, 0 - 255. */
    const unsigned char combining;
    /* Bidirectional class, as an index into
       _PyUnicode_BidirectionalNames. */
    const unsigned char bidirectional;
    /* True if the character is mirrored in bidir mode. */
    const unsigned char mirrored;
    /* East Asian width, as an index into
       _PyUnicode_EastAsianWidthNames. */
    const unsigned char east_asian_width;
    /* Packed quick-check bits; read by is_normalized_quickcheck(). */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
/* Per-character overrides used when an older database version is queried.
   For the unsigned char fields the callers in this file treat 0xFF as
   "no change"; category_changed == 0 is treated as "unassigned in the old
   version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;             /* old bidirectional index */
    const unsigned char category_changed;          /* old category index */
    const unsigned char decimal_changed;           /* old decimal value */
    const unsigned char mirrored_changed;          /* old mirrored flag */
    const unsigned char east_asian_width_changed;  /* old width index */
    const double numeric_changed;                  /* old numeric value */
} change_record;
- /* data file generated by Tools/unicode/makeunicodedata.py */
- #include "unicodedata_db.h"
- static const _PyUnicode_DatabaseRecord*
- _getrecord_ex(Py_UCS4 code)
- {
- int index;
- if (code >= 0x110000)
- index = 0;
- else {
- index = index1[(code>>SHIFT)];
- index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
- }
- return &_PyUnicode_Database_Records[index];
- }
- /* ------------- Previous-version API ------------------------------------- */
- typedef struct previous_version {
- PyObject_HEAD
- const char *name;
- const change_record* (*getrecord)(Py_UCS4);
- Py_UCS4 (*normalization)(Py_UCS4);
- } PreviousDBVersion;
- #include "clinic/unicodedata.c.h"
/* Fetch the change record of code point v from the PreviousDBVersion self. */
#define get_old_record(self, v) \
    ((((PreviousDBVersion *)(self))->getrecord)(v))
- static PyMemberDef DB_members[] = {
- {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
- {NULL}
- };
// Check whether self is a unicodedata.UCD instance.
// Evaluates to 0 if self is NULL (when the PyCapsule C API is used).
// PyModule_Check() is used to avoid having to retrieve the ucd_type.
// See the unicodedata_functions comment for the rationale of this macro.
#define UCD_Check(self) ((self) != NULL && !PyModule_Check(self))
- static PyObject*
- new_previous_version(PyTypeObject *ucd_type,
- const char*name, const change_record* (*getrecord)(Py_UCS4),
- Py_UCS4 (*normalization)(Py_UCS4))
- {
- PreviousDBVersion *self;
- self = PyObject_GC_New(PreviousDBVersion, ucd_type);
- if (self == NULL)
- return NULL;
- self->name = name;
- self->getrecord = getrecord;
- self->normalization = normalization;
- PyObject_GC_Track(self);
- return (PyObject*)self;
- }
- /* --- Module API --------------------------------------------------------- */
- /*[clinic input]
- unicodedata.UCD.decimal
- self: self
- chr: int(accept={str})
- default: object=NULL
- /
- Converts a Unicode character into its equivalent decimal value.
- Returns the decimal value assigned to the character chr as integer.
- If no such value is defined, default is returned, or, if not given,
- ValueError is raised.
- [clinic start generated code]*/
- static PyObject *
- unicodedata_UCD_decimal_impl(PyObject *self, int chr,
- PyObject *default_value)
- /*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
- {
- int have_old = 0;
- long rc;
- Py_UCS4 c = (Py_UCS4)chr;
- if (UCD_Check(self)) {
- const change_record *old = get_old_record(self, c);
- if (old->category_changed == 0) {
- /* unassigned */
- have_old = 1;
- rc = -1;
- }
- else if (old->decimal_changed != 0xFF) {
- have_old = 1;
- rc = old->decimal_changed;
- }
- }
- if (!have_old)
- rc = Py_UNICODE_TODECIMAL(c);
- if (rc < 0) {
- if (default_value == NULL) {
- PyErr_SetString(PyExc_ValueError,
- "not a decimal");
- return NULL;
- }
- else {
- return Py_NewRef(default_value);
- }
- }
- return PyLong_FromLong(rc);
- }
- /*[clinic input]
- unicodedata.UCD.digit
- self: self
- chr: int(accept={str})
- default: object=NULL
- /
- Converts a Unicode character into its equivalent digit value.
- Returns the digit value assigned to the character chr as integer.
- If no such value is defined, default is returned, or, if not given,
- ValueError is raised.
- [clinic start generated code]*/
- static PyObject *
- unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
- /*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
- {
- long rc;
- Py_UCS4 c = (Py_UCS4)chr;
- rc = Py_UNICODE_TODIGIT(c);
- if (rc < 0) {
- if (default_value == NULL) {
- PyErr_SetString(PyExc_ValueError, "not a digit");
- return NULL;
- }
- else {
- return Py_NewRef(default_value);
- }
- }
- return PyLong_FromLong(rc);
- }
- /*[clinic input]
- unicodedata.UCD.numeric
- self: self
- chr: int(accept={str})
- default: object=NULL
- /
- Converts a Unicode character into its equivalent numeric value.
- Returns the numeric value assigned to the character chr as float.
- If no such value is defined, default is returned, or, if not given,
- ValueError is raised.
- [clinic start generated code]*/
- static PyObject *
- unicodedata_UCD_numeric_impl(PyObject *self, int chr,
- PyObject *default_value)
- /*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
- {
- int have_old = 0;
- double rc;
- Py_UCS4 c = (Py_UCS4)chr;
- if (UCD_Check(self)) {
- const change_record *old = get_old_record(self, c);
- if (old->category_changed == 0) {
- /* unassigned */
- have_old = 1;
- rc = -1.0;
- }
- else if (old->decimal_changed != 0xFF) {
- have_old = 1;
- rc = old->decimal_changed;
- }
- }
- if (!have_old)
- rc = Py_UNICODE_TONUMERIC(c);
- if (rc == -1.0) {
- if (default_value == NULL) {
- PyErr_SetString(PyExc_ValueError, "not a numeric character");
- return NULL;
- }
- else {
- return Py_NewRef(default_value);
- }
- }
- return PyFloat_FromDouble(rc);
- }
- /*[clinic input]
- unicodedata.UCD.category
- self: self
- chr: int(accept={str})
- /
- Returns the general category assigned to the character chr as string.
- [clinic start generated code]*/
- static PyObject *
- unicodedata_UCD_category_impl(PyObject *self, int chr)
- /*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
- {
- int index;
- Py_UCS4 c = (Py_UCS4)chr;
- index = (int) _getrecord_ex(c)->category;
- if (UCD_Check(self)) {
- const change_record *old = get_old_record(self, c);
- if (old->category_changed != 0xFF)
- index = old->category_changed;
- }
- return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
- }
- /*[clinic input]
- unicodedata.UCD.bidirectional
- self: self
- chr: int(accept={str})
- /
- Returns the bidirectional class assigned to the character chr as string.
- If no such value is defined, an empty string is returned.
- [clinic start generated code]*/
- static PyObject *
- unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
- /*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
- {
- int index;
- Py_UCS4 c = (Py_UCS4)chr;
- index = (int) _getrecord_ex(c)->bidirectional;
- if (UCD_Check(self)) {
- const change_record *old = get_old_record(self, c);
- if (old->category_changed == 0)
- index = 0; /* unassigned */
- else if (old->bidir_changed != 0xFF)
- index = old->bidir_changed;
- }
- return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
- }
- /*[clinic input]
- unicodedata.UCD.combining -> int
- self: self
- chr: int(accept={str})
- /
- Returns the canonical combining class assigned to the character chr as integer.
- Returns 0 if no combining class is defined.
- [clinic start generated code]*/
- static int
- unicodedata_UCD_combining_impl(PyObject *self, int chr)
- /*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
- {
- int index;
- Py_UCS4 c = (Py_UCS4)chr;
- index = (int) _getrecord_ex(c)->combining;
- if (UCD_Check(self)) {
- const change_record *old = get_old_record(self, c);
- if (old->category_changed == 0)
- index = 0; /* unassigned */
- }
- return index;
- }
- /*[clinic input]
- unicodedata.UCD.mirrored -> int
- self: self
- chr: int(accept={str})
- /
- Returns the mirrored property assigned to the character chr as integer.
- Returns 1 if the character has been identified as a "mirrored"
- character in bidirectional text, 0 otherwise.
- [clinic start generated code]*/
- static int
- unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
- /*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
- {
- int index;
- Py_UCS4 c = (Py_UCS4)chr;
- index = (int) _getrecord_ex(c)->mirrored;
- if (UCD_Check(self)) {
- const change_record *old = get_old_record(self, c);
- if (old->category_changed == 0)
- index = 0; /* unassigned */
- else if (old->mirrored_changed != 0xFF)
- index = old->mirrored_changed;
- }
- return index;
- }
- /*[clinic input]
- unicodedata.UCD.east_asian_width
- self: self
- chr: int(accept={str})
- /
- Returns the east asian width assigned to the character chr as string.
- [clinic start generated code]*/
- static PyObject *
- unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
- /*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
- {
- int index;
- Py_UCS4 c = (Py_UCS4)chr;
- index = (int) _getrecord_ex(c)->east_asian_width;
- if (UCD_Check(self)) {
- const change_record *old = get_old_record(self, c);
- if (old->category_changed == 0)
- index = 0; /* unassigned */
- else if (old->east_asian_width_changed != 0xFF)
- index = old->east_asian_width_changed;
- }
- return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
- }
- /*[clinic input]
- unicodedata.UCD.decomposition
- self: self
- chr: int(accept={str})
- /
- Returns the character decomposition mapping assigned to the character chr as string.
- An empty string is returned in case no such mapping is defined.
- [clinic start generated code]*/
- static PyObject *
- unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
- /*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
- {
- char decomp[256];
- int code, index, count;
- size_t i;
- unsigned int prefix_index;
- Py_UCS4 c = (Py_UCS4)chr;
- code = (int)c;
- if (UCD_Check(self)) {
- const change_record *old = get_old_record(self, c);
- if (old->category_changed == 0)
- return PyUnicode_FromString(""); /* unassigned */
- }
- if (code < 0 || code >= 0x110000)
- index = 0;
- else {
- index = decomp_index1[(code>>DECOMP_SHIFT)];
- index = decomp_index2[(index<<DECOMP_SHIFT)+
- (code&((1<<DECOMP_SHIFT)-1))];
- }
- /* high byte is number of hex bytes (usually one or two), low byte
- is prefix code (from*/
- count = decomp_data[index] >> 8;
- /* XXX: could allocate the PyString up front instead
- (strlen(prefix) + 5 * count + 1 bytes) */
- /* Based on how index is calculated above and decomp_data is generated
- from Tools/unicode/makeunicodedata.py, it should not be possible
- to overflow decomp_prefix. */
- prefix_index = decomp_data[index] & 255;
- assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
- /* copy prefix */
- i = strlen(decomp_prefix[prefix_index]);
- memcpy(decomp, decomp_prefix[prefix_index], i);
- while (count-- > 0) {
- if (i)
- decomp[i++] = ' ';
- assert(i < sizeof(decomp));
- PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
- decomp_data[++index]);
- i += strlen(decomp + i);
- }
- return PyUnicode_FromStringAndSize(decomp, i);
- }
- static void
- get_decomp_record(PyObject *self, Py_UCS4 code,
- int *index, int *prefix, int *count)
- {
- if (code >= 0x110000) {
- *index = 0;
- }
- else if (UCD_Check(self)
- && get_old_record(self, code)->category_changed==0) {
- /* unassigned in old version */
- *index = 0;
- }
- else {
- *index = decomp_index1[(code>>DECOMP_SHIFT)];
- *index = decomp_index2[(*index<<DECOMP_SHIFT)+
- (code&((1<<DECOMP_SHIFT)-1))];
- }
- /* high byte is number of hex bytes (usually one or two), low byte
- is prefix code (from*/
- *count = decomp_data[*index] >> 8;
- *prefix = decomp_data[*index] & 255;
- (*index)++;
- }
/* Constants for Hangul syllable (de)composition. */
#define SBase   0xAC00  /* first precomposed syllable */
#define LBase   0x1100  /* first leading consonant */
#define VBase   0x1161  /* first vowel */
#define TBase   0x11A7  /* one before the first trailing consonant */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount * TCount)
#define SCount  (LCount * NCount)
- static PyObject*
- nfd_nfkd(PyObject *self, PyObject *input, int k)
- {
- PyObject *result;
- Py_UCS4 *output;
- Py_ssize_t i, o, osize;
- int kind;
- const void *data;
- /* Longest decomposition in Unicode 3.2: U+FDFA */
- Py_UCS4 stack[20];
- Py_ssize_t space, isize;
- int index, prefix, count, stackptr;
- unsigned char prev, cur;
- stackptr = 0;
- isize = PyUnicode_GET_LENGTH(input);
- space = isize;
- /* Overallocate at most 10 characters. */
- if (space > 10) {
- if (space <= PY_SSIZE_T_MAX - 10)
- space += 10;
- }
- else {
- space *= 2;
- }
- osize = space;
- output = PyMem_NEW(Py_UCS4, space);
- if (!output) {
- PyErr_NoMemory();
- return NULL;
- }
- i = o = 0;
- kind = PyUnicode_KIND(input);
- data = PyUnicode_DATA(input);
- while (i < isize) {
- stack[stackptr++] = PyUnicode_READ(kind, data, i++);
- while(stackptr) {
- Py_UCS4 code = stack[--stackptr];
- /* Hangul Decomposition adds three characters in
- a single step, so we need at least that much room. */
- if (space < 3) {
- Py_UCS4 *new_output;
- osize += 10;
- space += 10;
- new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
- if (new_output == NULL) {
- PyMem_Free(output);
- PyErr_NoMemory();
- return NULL;
- }
- output = new_output;
- }
- /* Hangul Decomposition. */
- if (SBase <= code && code < (SBase+SCount)) {
- int SIndex = code - SBase;
- int L = LBase + SIndex / NCount;
- int V = VBase + (SIndex % NCount) / TCount;
- int T = TBase + SIndex % TCount;
- output[o++] = L;
- output[o++] = V;
- space -= 2;
- if (T != TBase) {
- output[o++] = T;
- space --;
- }
- continue;
- }
- /* normalization changes */
- if (UCD_Check(self)) {
- Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
- if (value != 0) {
- stack[stackptr++] = value;
- continue;
- }
- }
- /* Other decompositions. */
- get_decomp_record(self, code, &index, &prefix, &count);
- /* Copy character if it is not decomposable, or has a
- compatibility decomposition, but we do NFD. */
- if (!count || (prefix && !k)) {
- output[o++] = code;
- space--;
- continue;
- }
- /* Copy decomposition onto the stack, in reverse
- order. */
- while(count) {
- code = decomp_data[index + (--count)];
- stack[stackptr++] = code;
- }
- }
- }
- result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
- output, o);
- PyMem_Free(output);
- if (!result)
- return NULL;
- /* result is guaranteed to be ready, as it is compact. */
- kind = PyUnicode_KIND(result);
- data = PyUnicode_DATA(result);
- /* Sort canonically. */
- i = 0;
- prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
- for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
- cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
- if (prev == 0 || cur == 0 || prev <= cur) {
- prev = cur;
- continue;
- }
- /* Non-canonical order. Need to switch *i with previous. */
- o = i - 1;
- while (1) {
- Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
- PyUnicode_WRITE(kind, data, o+1,
- PyUnicode_READ(kind, data, o));
- PyUnicode_WRITE(kind, data, o, tmp);
- o--;
- if (o < 0)
- break;
- prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
- if (prev == 0 || prev <= cur)
- break;
- }
- prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
- }
- return result;
- }
- static int
- find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
- {
- unsigned int index;
- for (index = 0; nfc[index].start; index++) {
- unsigned int start = nfc[index].start;
- if (code < start)
- return -1;
- if (code <= start + nfc[index].count) {
- unsigned int delta = code - start;
- return nfc[index].index + delta;
- }
- }
- return -1;
- }
- static PyObject*
- nfc_nfkc(PyObject *self, PyObject *input, int k)
- {
- PyObject *result;
- int kind;
- const void *data;
- Py_UCS4 *output;
- Py_ssize_t i, i1, o, len;
- int f,l,index,index1,comb;
- Py_UCS4 code;
- Py_ssize_t skipped[20];
- int cskipped = 0;
- result = nfd_nfkd(self, input, k);
- if (!result)
- return NULL;
- /* result will be "ready". */
- kind = PyUnicode_KIND(result);
- data = PyUnicode_DATA(result);
- len = PyUnicode_GET_LENGTH(result);
- /* We allocate a buffer for the output.
- If we find that we made no changes, we still return
- the NFD result. */
- output = PyMem_NEW(Py_UCS4, len);
- if (!output) {
- PyErr_NoMemory();
- Py_DECREF(result);
- return 0;
- }
- i = o = 0;
- again:
- while (i < len) {
- for (index = 0; index < cskipped; index++) {
- if (skipped[index] == i) {
- /* *i character is skipped.
- Remove from list. */
- skipped[index] = skipped[cskipped-1];
- cskipped--;
- i++;
- goto again; /* continue while */
- }
- }
- /* Hangul Composition. We don't need to check for <LV,T>
- pairs, since we always have decomposed data. */
- code = PyUnicode_READ(kind, data, i);
- if (LBase <= code && code < (LBase+LCount) &&
- i + 1 < len &&
- VBase <= PyUnicode_READ(kind, data, i+1) &&
- PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
- /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
- and V character is a modern vowel (0x1161 ~ 0x1175). */
- int LIndex, VIndex;
- LIndex = code - LBase;
- VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
- code = SBase + (LIndex*VCount+VIndex)*TCount;
- i+=2;
- if (i < len &&
- TBase < PyUnicode_READ(kind, data, i) &&
- PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
- /* check T character is a modern trailing consonant
- (0x11A8 ~ 0x11C2). */
- code += PyUnicode_READ(kind, data, i)-TBase;
- i++;
- }
- output[o++] = code;
- continue;
- }
- /* code is still input[i] here */
- f = find_nfc_index(nfc_first, code);
- if (f == -1) {
- output[o++] = code;
- i++;
- continue;
- }
- /* Find next unblocked character. */
- i1 = i+1;
- comb = 0;
- /* output base character for now; might be updated later. */
- output[o] = PyUnicode_READ(kind, data, i);
- while (i1 < len) {
- Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
- int comb1 = _getrecord_ex(code1)->combining;
- if (comb) {
- if (comb1 == 0)
- break;
- if (comb >= comb1) {
- /* Character is blocked. */
- i1++;
- continue;
- }
- }
- l = find_nfc_index(nfc_last, code1);
- /* i1 cannot be combined with i. If i1
- is a starter, we don't need to look further.
- Otherwise, record the combining class. */
- if (l == -1) {
- not_combinable:
- if (comb1 == 0)
- break;
- comb = comb1;
- i1++;
- continue;
- }
- index = f*TOTAL_LAST + l;
- index1 = comp_index[index >> COMP_SHIFT];
- code = comp_data[(index1<<COMP_SHIFT)+
- (index&((1<<COMP_SHIFT)-1))];
- if (code == 0)
- goto not_combinable;
- /* Replace the original character. */
- output[o] = code;
- /* Mark the second character unused. */
- assert(cskipped < 20);
- skipped[cskipped++] = i1;
- i1++;
- f = find_nfc_index(nfc_first, output[o]);
- if (f == -1)
- break;
- }
- /* Output character was already written.
- Just advance the indices. */
- o++; i++;
- }
- if (o == len) {
- /* No changes. Return original string. */
- PyMem_Free(output);
- return result;
- }
- Py_DECREF(result);
- result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
- output, o);
- PyMem_Free(output);
- return result;
- }
// Outcome of the normalization quickcheck.
// The numeric values need to match the logic in makeunicodedata.py,
// which constructs the quickcheck data.
typedef enum {
    YES = 0,
    MAYBE = 1,
    NO = 2
} QuickcheckResult;
- /* Run the Unicode normalization "quickcheck" algorithm.
- *
- * Return YES or NO if quickcheck determines the input is certainly
- * normalized or certainly not, and MAYBE if quickcheck is unable to
- * tell.
- *
- * If `yes_only` is true, then return MAYBE as soon as we determine
- * the answer is not YES.
- *
- * For background and details on the algorithm, see UAX #15:
- * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
- */
- static QuickcheckResult
- is_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k,
- bool yes_only)
- {
- /* UCD 3.2.0 is requested, quickchecks must be disabled. */
- if (UCD_Check(self)) {
- return MAYBE;
- }
- if (PyUnicode_IS_ASCII(input)) {
- return YES;
- }
- Py_ssize_t i, len;
- int kind;
- const void *data;
- unsigned char prev_combining = 0;
- /* The two quickcheck bits at this shift have type QuickcheckResult. */
- int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
- QuickcheckResult result = YES; /* certainly normalized, unless we find something */
- i = 0;
- kind = PyUnicode_KIND(input);
- data = PyUnicode_DATA(input);
- len = PyUnicode_GET_LENGTH(input);
- while (i < len) {
- Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
- const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
- unsigned char combining = record->combining;
- if (combining && prev_combining > combining)
- return NO; /* non-canonical sort order, not normalized */
- prev_combining = combining;
- unsigned char quickcheck_whole = record->normalization_quick_check;
- if (yes_only) {
- if (quickcheck_whole & (3 << quickcheck_shift))
- return MAYBE;
- } else {
- switch ((quickcheck_whole >> quickcheck_shift) & 3) {
- case NO:
- return NO;
- case MAYBE:
- result = MAYBE; /* this string might need normalization */
- }
- }
- }
- return result;
- }
- /*[clinic input]
- unicodedata.UCD.is_normalized
- self: self
- form: unicode
- unistr as input: unicode
- /
- Return whether the Unicode string unistr is in the normal form 'form'.
- Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
- [clinic start generated code]*/
- static PyObject *
- unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
- PyObject *input)
- /*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
- {
- if (PyUnicode_READY(input) == -1) {
- return NULL;
- }
- if (PyUnicode_GET_LENGTH(input) == 0) {
- /* special case empty input strings. */
- Py_RETURN_TRUE;
- }
- PyObject *result;
- bool nfc = false;
- bool k = false;
- QuickcheckResult m;
- PyObject *cmp;
- int match = 0;
- if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
- nfc = true;
- }
- else if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
- nfc = true;
- k = true;
- }
- else if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
- /* matches default values for `nfc` and `k` */
- }
- else if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
- k = true;
- }
- else {
- PyErr_SetString(PyExc_ValueError, "invalid normalization form");
- return NULL;
- }
- m = is_normalized_quickcheck(self, input, nfc, k, false);
- if (m == MAYBE) {
- cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
- if (cmp == NULL) {
- return NULL;
- }
- match = PyUnicode_Compare(input, cmp);
- Py_DECREF(cmp);
- result = (match == 0) ? Py_True : Py_False;
- }
- else {
- result = (m == YES) ? Py_True : Py_False;
- }
- return Py_NewRef(result);
- }
- /*[clinic input]
- unicodedata.UCD.normalize
- self: self
- form: unicode
- unistr as input: unicode
- /
- Return the normal form 'form' for the Unicode string unistr.
- Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
- [clinic start generated code]*/
- static PyObject *
- unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
- PyObject *input)
- /*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
- {
- if (PyUnicode_GET_LENGTH(input) == 0) {
- /* Special case empty input strings, since resizing
- them later would cause internal errors. */
- return Py_NewRef(input);
- }
- if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
- if (is_normalized_quickcheck(self, input,
- true, false, true) == YES) {
- return Py_NewRef(input);
- }
- return nfc_nfkc(self, input, 0);
- }
- if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
- if (is_normalized_quickcheck(self, input,
- true, true, true) == YES) {
- return Py_NewRef(input);
- }
- return nfc_nfkc(self, input, 1);
- }
- if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
- if (is_normalized_quickcheck(self, input,
- false, false, true) == YES) {
- return Py_NewRef(input);
- }
- return nfd_nfkd(self, input, 0);
- }
- if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
- if (is_normalized_quickcheck(self, input,
- false, true, true) == YES) {
- return Py_NewRef(input);
- }
- return nfd_nfkd(self, input, 1);
- }
- PyErr_SetString(PyExc_ValueError, "invalid normalization form");
- return NULL;
- }
- /* -------------------------------------------------------------------- */
- /* unicode character name tables */
- /* data file generated by Tools/unicode/makeunicodedata.py */
- #include "unicodename_db.h"
- /* -------------------------------------------------------------------- */
- /* database code (cut and pasted from the unidb package) */
/* Hash the first len bytes of s, upper-casing each byte with Py_TOUPPER
   and multiplying the running hash by scale at every step.  Whenever bits
   24-31 become set they are folded back in and the hash is reduced to
   24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;

    for (int pos = 0; pos < len; pos++) {
        hash = (hash * scale) + (unsigned char) Py_TOUPPER(s[pos]);

        const unsigned long high = hash & 0xff000000;
        if (high != 0) {
            hash = (hash ^ ((high >> 24) & 0xff)) & 0x00ffffff;
        }
    }
    return hash;
}
/* Name fragments for Hangul syllables.  Column 0 is indexed by L,
   column 1 by V and column 2 by T (see _getucname); entries past the end
   of a column are NULL. */
static const char * const hangul_syllables[][3] = {
    /*  L       V       T   */
    { "G",    "A",    ""   },
    { "GG",   "AE",   "G"  },
    { "N",    "YA",   "GG" },
    { "D",    "YAE",  "GS" },
    { "DD",   "EO",   "N"  },
    { "R",    "E",    "NJ" },
    { "M",    "YEO",  "NH" },
    { "B",    "YE",   "D"  },
    { "BB",   "O",    "L"  },
    { "S",    "WA",   "LG" },
    { "SS",   "WAE",  "LM" },
    { "",     "OE",   "LB" },
    { "J",    "YO",   "LS" },
    { "JJ",   "U",    "LT" },
    { "C",    "WEO",  "LP" },
    { "K",    "WE",   "LH" },
    { "T",    "WI",   "M"  },
    { "P",    "YU",   "B"  },
    { "H",    "EU",   "BS" },
    { NULL,   "YI",   "S"  },
    { NULL,   "I",    "SS" },
    { NULL,   NULL,   "NG" },
    { NULL,   NULL,   "J"  },
    { NULL,   NULL,   "C"  },
    { NULL,   NULL,   "K"  },
    { NULL,   NULL,   "T"  },
    { NULL,   NULL,   "P"  },
    { NULL,   NULL,   "H"  }
};
- /* These ranges need to match makeunicodedata.py:cjk_ranges. */
- static int
- is_unified_ideograph(Py_UCS4 code)
- {
- return
- (0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
- (0x4E00 <= code && code <= 0x9FFF) || /* CJK Ideograph */
- (0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */
- (0x2A700 <= code && code <= 0x2B739) || /* CJK Ideograph Extension C */
- (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
- (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
- (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
- (0x30000 <= code && code <= 0x3134A) || /* CJK Ideograph Extension G */
- (0x31350 <= code && code <= 0x323AF); /* CJK Ideograph Extension H */
- }
/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.
 *
 * The argument is parenthesized so that an expression argument (for example
 * `x & mask`) is compared as a whole.  It is expanded twice, so it must not
 * have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
- static int
- _getucname(PyObject *self,
- Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
- {
- /* Find the name associated with the given code point.
- * If with_alias_and_seq is 1, check for names in the Private Use Area 15
- * that we are using for aliases and named sequences. */
- int offset;
- int i;
- int word;
- const unsigned char* w;
- if (code >= 0x110000)
- return 0;
- /* XXX should we just skip all the code points in the PUAs here? */
- if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
- return 0;
- if (UCD_Check(self)) {
- /* in 3.2.0 there are no aliases and named sequences */
- const change_record *old;
- if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
- return 0;
- old = get_old_record(self, code);
- if (old->category_changed == 0) {
- /* unassigned */
- return 0;
- }
- }
- if (SBase <= code && code < SBase+SCount) {
- /* Hangul syllable. */
- int SIndex = code - SBase;
- int L = SIndex / NCount;
- int V = (SIndex % NCount) / TCount;
- int T = SIndex % TCount;
- if (buflen < 27)
- /* Worst case: HANGUL SYLLABLE <10chars>. */
- return 0;
- strcpy(buffer, "HANGUL SYLLABLE ");
- buffer += 16;
- strcpy(buffer, hangul_syllables[L][0]);
- buffer += strlen(hangul_syllables[L][0]);
- strcpy(buffer, hangul_syllables[V][1]);
- buffer += strlen(hangul_syllables[V][1]);
- strcpy(buffer, hangul_syllables[T][2]);
- buffer += strlen(hangul_syllables[T][2]);
- *buffer = '\0';
- return 1;
- }
- if (is_unified_ideograph(code)) {
- if (buflen < 28)
- /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
- return 0;
- sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
- return 1;
- }
- /* get offset into phrasebook */
- offset = phrasebook_offset1[(code>>phrasebook_shift)];
- offset = phrasebook_offset2[(offset<<phrasebook_shift) +
- (code&((1<<phrasebook_shift)-1))];
- if (!offset)
- return 0;
- i = 0;
- for (;;) {
- /* get word index */
- word = phrasebook[offset] - phrasebook_short;
- if (word >= 0) {
- word = (word << 8) + phrasebook[offset+1];
- offset += 2;
- } else
- word = phrasebook[offset++];
- if (i) {
- if (i > buflen)
- return 0; /* buffer overflow */
- buffer[i++] = ' ';
- }
- /* copy word string from lexicon. the last character in the
- word has bit 7 set. the last word in a string ends with
- 0x80 */
- w = lexicon + lexicon_offset[word];
- while (*w < 128) {
- if (i >= buflen)
- return 0; /* buffer overflow */
- buffer[i++] = *w++;
- }
- if (i >= buflen)
- return 0; /* buffer overflow */
- buffer[i++] = *w & 127;
- if (*w == 128)
- break; /* end of word */
- }
- return 1;
- }
- static int
- capi_getucname(Py_UCS4 code,
- char* buffer, int buflen,
- int with_alias_and_seq)
- {
- return _getucname(NULL, code, buffer, buflen, with_alias_and_seq);
- }
- static int
- _cmpname(PyObject *self, int code, const char* name, int namelen)
- {
- /* check if code corresponds to the given name */
- int i;
- char buffer[NAME_MAXLEN+1];
- if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
- return 0;
- for (i = 0; i < namelen; i++) {
- if (Py_TOUPPER(name[i]) != buffer[i])
- return 0;
- }
- return buffer[namelen] == '\0';
- }
- static void
- find_syllable(const char *str, int *len, int *pos, int count, int column)
- {
- int i, len1;
- *len = -1;
- for (i = 0; i < count; i++) {
- const char *s = hangul_syllables[i][column];
- len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
- if (len1 <= *len)
- continue;
- if (strncmp(str, s, len1) == 0) {
- *len = len1;
- *pos = i;
- }
- }
- if (*len == -1) {
- *len = 0;
- }
- }
- static int
- _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
- {
- /* check if named sequences are allowed */
- if (!with_named_seq && IS_NAMED_SEQ(cp))
- return 0;
- /* if the code point is in the PUA range that we use for aliases,
- * convert it to obtain the right code point */
- if (IS_ALIAS(cp))
- *code = name_aliases[cp-aliases_start];
- else
- *code = cp;
- return 1;
- }
- static int
- _getcode(PyObject* self,
- const char* name, int namelen, Py_UCS4* code, int with_named_seq)
- {
- /* Return the code point associated with the given name.
- * Named aliases are resolved too (unless self != NULL (i.e. we are using
- * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
- * using for the named sequence, and the caller must then convert it. */
- unsigned int h, v;
- unsigned int mask = code_size-1;
- unsigned int i, incr;
- /* Check for hangul syllables. */
- if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
- int len, L = -1, V = -1, T = -1;
- const char *pos = name + 16;
- find_syllable(pos, &len, &L, LCount, 0);
- pos += len;
- find_syllable(pos, &len, &V, VCount, 1);
- pos += len;
- find_syllable(pos, &len, &T, TCount, 2);
- pos += len;
- if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
- *code = SBase + (L*VCount+V)*TCount + T;
- return 1;
- }
- /* Otherwise, it's an illegal syllable name. */
- return 0;
- }
- /* Check for unified ideographs. */
- if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
- /* Four or five hexdigits must follow. */
- v = 0;
- name += 22;
- namelen -= 22;
- if (namelen != 4 && namelen != 5)
- return 0;
- while (namelen--) {
- v *= 16;
- if (*name >= '0' && *name <= '9')
- v += *name - '0';
- else if (*name >= 'A' && *name <= 'F')
- v += *name - 'A' + 10;
- else
- return 0;
- name++;
- }
- if (!is_unified_ideograph(v))
- return 0;
- *code = v;
- return 1;
- }
- /* the following is the same as python's dictionary lookup, with
- only minor changes. see the makeunicodedata script for more
- details */
- h = (unsigned int) _gethash(name, namelen, code_magic);
- i = (~h) & mask;
- v = code_hash[i];
- if (!v)
- return 0;
- if (_cmpname(self, v, name, namelen)) {
- return _check_alias_and_seq(v, code, with_named_seq);
- }
- incr = (h ^ (h >> 3)) & mask;
- if (!incr)
- incr = mask;
- for (;;) {
- i = (i + incr) & mask;
- v = code_hash[i];
- if (!v)
- return 0;
- if (_cmpname(self, v, name, namelen)) {
- return _check_alias_and_seq(v, code, with_named_seq);
- }
- incr = incr << 1;
- if (incr > mask)
- incr = incr ^ code_poly;
- }
- }
- static int
- capi_getcode(const char* name, int namelen, Py_UCS4* code,
- int with_named_seq)
- {
- return _getcode(NULL, name, namelen, code, with_named_seq);
- }
- static void
- unicodedata_destroy_capi(PyObject *capsule)
- {
- void *capi = PyCapsule_GetPointer(capsule, PyUnicodeData_CAPSULE_NAME);
- PyMem_Free(capi);
- }
- static PyObject *
- unicodedata_create_capi(void)
- {
- _PyUnicode_Name_CAPI *capi = PyMem_Malloc(sizeof(_PyUnicode_Name_CAPI));
- if (capi == NULL) {
- PyErr_NoMemory();
- return NULL;
- }
- capi->getname = capi_getucname;
- capi->getcode = capi_getcode;
- PyObject *capsule = PyCapsule_New(capi,
- PyUnicodeData_CAPSULE_NAME,
- unicodedata_destroy_capi);
- if (capsule == NULL) {
- PyMem_Free(capi);
- }
- return capsule;
- };
- /* -------------------------------------------------------------------- */
- /* Python bindings */
- /*[clinic input]
- unicodedata.UCD.name
- self: self
- chr: int(accept={str})
- default: object=NULL
- /
- Returns the name assigned to the character chr as a string.
- If no name is defined, default is returned, or, if not given,
- ValueError is raised.
- [clinic start generated code]*/
- static PyObject *
- unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
- /*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
- {
- char name[NAME_MAXLEN+1];
- Py_UCS4 c = (Py_UCS4)chr;
- if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
- if (default_value == NULL) {
- PyErr_SetString(PyExc_ValueError, "no such name");
- return NULL;
- }
- else {
- return Py_NewRef(default_value);
- }
- }
- return PyUnicode_FromString(name);
- }
- /*[clinic input]
- unicodedata.UCD.lookup
- self: self
- name: str(accept={str, robuffer}, zeroes=True)
- /
- Look up character by name.
- If a character with the given name is found, return the
- corresponding character. If not found, KeyError is raised.
- [clinic start generated code]*/
- static PyObject *
- unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
- Py_ssize_t name_length)
- /*[clinic end generated code: output=7f03fc4959b242f6 input=a557be0f8607a0d6]*/
- {
- Py_UCS4 code;
- unsigned int index;
- if (name_length > NAME_MAXLEN) {
- PyErr_SetString(PyExc_KeyError, "name too long");
- return NULL;
- }
- if (!_getcode(self, name, (int)name_length, &code, 1)) {
- PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
- return NULL;
- }
- /* check if code is in the PUA range that we use for named sequences
- and convert it */
- if (IS_NAMED_SEQ(code)) {
- index = code-named_sequences_start;
- return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
- named_sequences[index].seq,
- named_sequences[index].seqlen);
- }
- return PyUnicode_FromOrdinal(code);
- }
- // List of functions used to define module functions *AND* unicodedata.UCD
- // methods. For module functions, self is the module. For UCD methods, self
- // is an UCD instance. The UCD_Check() macro is used to check if self is
- // an UCD instance.
- static PyMethodDef unicodedata_functions[] = {
- UNICODEDATA_UCD_DECIMAL_METHODDEF
- UNICODEDATA_UCD_DIGIT_METHODDEF
- UNICODEDATA_UCD_NUMERIC_METHODDEF
- UNICODEDATA_UCD_CATEGORY_METHODDEF
- UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
- UNICODEDATA_UCD_COMBINING_METHODDEF
- UNICODEDATA_UCD_MIRRORED_METHODDEF
- UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
- UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
- UNICODEDATA_UCD_NAME_METHODDEF
- UNICODEDATA_UCD_LOOKUP_METHODDEF
- UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
- UNICODEDATA_UCD_NORMALIZE_METHODDEF
- {NULL, NULL} /* sentinel */
- };
- static int
- ucd_traverse(PreviousDBVersion *self, visitproc visit, void *arg)
- {
- Py_VISIT(Py_TYPE(self));
- return 0;
- }
- static void
- ucd_dealloc(PreviousDBVersion *self)
- {
- PyTypeObject *tp = Py_TYPE(self);
- PyObject_GC_UnTrack(self);
- PyObject_GC_Del(self);
- Py_DECREF(tp);
- }
- static PyType_Slot ucd_type_slots[] = {
- {Py_tp_dealloc, ucd_dealloc},
- {Py_tp_traverse, ucd_traverse},
- {Py_tp_getattro, PyObject_GenericGetAttr},
- {Py_tp_methods, unicodedata_functions},
- {Py_tp_members, DB_members},
- {0, 0}
- };
- static PyType_Spec ucd_type_spec = {
- .name = "unicodedata.UCD",
- .basicsize = sizeof(PreviousDBVersion),
- .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION |
- Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_IMMUTABLETYPE),
- .slots = ucd_type_slots
- };
- PyDoc_STRVAR(unicodedata_docstring,
- "This module provides access to the Unicode Character Database which\n\
- defines character properties for all Unicode characters. The data in\n\
- this database is based on the UnicodeData.txt file version\n\
- " UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
- \n\
- The module uses the same names and symbols as defined by the\n\
- UnicodeData File Format " UNIDATA_VERSION ".");
- static int
- unicodedata_exec(PyObject *module)
- {
- if (PyModule_AddStringConstant(module, "unidata_version", UNIDATA_VERSION) < 0) {
- return -1;
- }
- PyTypeObject *ucd_type = (PyTypeObject *)PyType_FromSpec(&ucd_type_spec);
- if (ucd_type == NULL) {
- return -1;
- }
- if (PyModule_AddType(module, ucd_type) < 0) {
- Py_DECREF(ucd_type);
- return -1;
- }
- // Unicode database version 3.2.0 used by the IDNA encoding
- PyObject *v;
- v = new_previous_version(ucd_type, "3.2.0",
- get_change_3_2_0, normalization_3_2_0);
- Py_DECREF(ucd_type);
- if (v == NULL) {
- return -1;
- }
- if (PyModule_AddObject(module, "ucd_3_2_0", v) < 0) {
- Py_DECREF(v);
- return -1;
- }
- /* Export C API */
- PyObject *capsule = unicodedata_create_capi();
- if (capsule == NULL) {
- return -1;
- }
- int rc = PyModule_AddObjectRef(module, "_ucnhash_CAPI", capsule);
- Py_DECREF(capsule);
- if (rc < 0) {
- return -1;
- }
- return 0;
- }
- static PyModuleDef_Slot unicodedata_slots[] = {
- {Py_mod_exec, unicodedata_exec},
- {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
- {0, NULL}
- };
- static struct PyModuleDef unicodedata_module = {
- PyModuleDef_HEAD_INIT,
- .m_name = "unicodedata",
- .m_doc = unicodedata_docstring,
- .m_size = 0,
- .m_methods = unicodedata_functions,
- .m_slots = unicodedata_slots,
- };
- PyMODINIT_FUNC
- PyInit_unicodedata(void)
- {
- return PyModuleDef_Init(&unicodedata_module);
- }
- /*
- Local variables:
- c-basic-offset: 4
- indent-tabs-mode: nil
- End:
- */
|