12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- /*
- ******************************************************************************
- *
- * Copyright (C) 1999-2014, International Business Machines
- * Corporation and others. All Rights Reserved.
- *
- ******************************************************************************
- * file name: unames.c
- * encoding: UTF-8
- * tab size: 8 (not used)
- * indentation:4
- *
- * created on: 1999oct04
- * created by: Markus W. Scherer
- */
- #include "unicode/utypes.h"
- #include "unicode/putil.h"
- #include "unicode/uchar.h"
- #include "unicode/udata.h"
- #include "unicode/utf.h"
- #include "unicode/utf16.h"
- #include "uassert.h"
- #include "ustr_imp.h"
- #include "umutex.h"
- #include "cmemory.h"
- #include "cstring.h"
- #include "ucln_cmn.h"
- #include "udataswp.h"
- #include "uprops.h"
- U_NAMESPACE_BEGIN
- /* prototypes ------------------------------------------------------------- */
/* Name and type of the data item that loadCharNames() opens. */
static const char DATA_NAME[] = "unames";
static const char DATA_TYPE[] = "icu";

/*
 * Names are stored in groups of 2^GROUP_SHIFT (= 32) consecutive code points:
 * code >> GROUP_SHIFT selects the group, code & GROUP_MASK the line inside it.
 */
#define GROUP_SHIFT 5
#define LINES_PER_GROUP (1L << GROUP_SHIFT)
#define GROUP_MASK (LINES_PER_GROUP - 1)
- /*
- * This struct was replaced by explicitly accessing equivalent
- * fields from triples of uint16_t.
- * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
- * which broke the assumption that sizeof(Group)==6 and that the ++ operator
- * would advance by 6 bytes (3 uint16_t).
- *
- * We can't just change the data structure because it's loaded from a data file,
- * and we don't want to make it less compact, so we changed the access code.
- *
- * For details see ICU tickets 6331 and 6008.
- typedef struct {
- uint16_t groupMSB,
- offsetHigh, offsetLow; / * avoid padding * /
- } Group;
- */
/*
 * Indexes of the uint16_t fields inside one group triple,
 * followed by the number of uint16_t units per triple.
 */
enum {
    GROUP_MSB = 0,
    GROUP_OFFSET_HIGH = 1,
    GROUP_OFFSET_LOW = 2,
    GROUP_LENGTH = 3
};
/*
 * Get the 32-bit group offset.
 * The high half is widened to uint32_t before the shift so that a value with
 * bit 15 set is never shifted into the sign bit of a signed integer.
 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
 * @return group offset (int32_t)
 */
#define GET_GROUP_OFFSET(group) \
    ((int32_t)(((uint32_t)(group)[GROUP_OFFSET_HIGH] << 16) | (group)[GROUP_OFFSET_LOW]))

/* Advance to the following / preceding Group triple. */
#define NEXT_GROUP(group) ((group) + GROUP_LENGTH)
#define PREV_GROUP(group) ((group) - GROUP_LENGTH)
/*
 * Header of one algorithmic-name range.
 * getAlgName() reads the range's prefix/factor data from the memory that
 * directly follows this header (range + 1).
 */
typedef struct {
    uint32_t start;   /* getAlgName() passes code - start to the factorized (type 1) algorithm */
    uint32_t end;
    uint8_t type;     /* 0: prefix + hex digits, 1: prefix + factorized elements */
    uint8_t variant;  /* type 0: number of hex digits; type 1: number of factors */
    uint16_t size;    /* NOTE(review): presumably the byte size of the whole range record - confirm */
} AlgorithmicRange;
/*
 * Index header at the start of the loaded names data.
 * Each offset is a byte offset measured from the start of this struct.
 */
typedef struct {
    uint32_t tokenStringOffset;  /* start of the token strings */
    uint32_t groupsOffset;       /* start of the groups table (see GET_GROUPS) */
    uint32_t groupStringOffset;  /* start of the group strings */
    uint32_t algNamesOffset;     /* NOTE(review): presumably the algorithmic-range data - confirm */
} UCharNames;
/*
 * Get the groups table from a UCharNames struct.
 * The groups table consists of one uint16_t groupCount followed by
 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH.
 *
 * The argument and the whole expansion are parenthesized so that the macro
 * can be used with any pointer expression and can be indexed or dereferenced
 * directly, e.g. GET_GROUPS(p)[0].
 *
 * @param names (const UCharNames *) pointer to the UCharNames indexes
 * @return (const uint16_t *) pointer to the groups table
 */
#define GET_GROUPS(names) \
    ((const uint16_t *)((const char *)(names) + (names)->groupsOffset))
- typedef struct {
- const char *otherName;
- UChar32 code;
- } FindName;
- #define DO_FIND_NAME nullptr
- static UDataMemory *uCharNamesData=nullptr;
- static UCharNames *uCharNames=nullptr;
- static icu::UInitOnce gCharNamesInitOnce {};
- /*
- * Maximum length of character names (regular & 1.0).
- */
- static int32_t gMaxNameLength=0;
- /*
- * Set of chars used in character names (regular & 1.0).
- * Chars are platform-dependent (can be EBCDIC).
- */
- static uint32_t gNameSet[8]={ 0 };
/*
 * Extra pseudo-categories appended after the regular general categories.
 * Every expansion is parenthesized so that the macros can be used safely
 * inside larger expressions.
 */
#define U_NONCHARACTER_CODE_POINT (U_CHAR_CATEGORY_COUNT)
#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)
#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
- static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
- "unassigned",
- "uppercase letter",
- "lowercase letter",
- "titlecase letter",
- "modifier letter",
- "other letter",
- "non spacing mark",
- "enclosing mark",
- "combining spacing mark",
- "decimal digit number",
- "letter number",
- "other number",
- "space separator",
- "line separator",
- "paragraph separator",
- "control",
- "format",
- "private use area",
- "surrogate",
- "dash punctuation",
- "start punctuation",
- "end punctuation",
- "connector punctuation",
- "other punctuation",
- "math symbol",
- "currency symbol",
- "modifier symbol",
- "other symbol",
- "initial punctuation",
- "final punctuation",
- "noncharacter",
- "lead surrogate",
- "trail surrogate"
- };
- /* implementation ----------------------------------------------------------- */
- static UBool U_CALLCONV unames_cleanup()
- {
- if(uCharNamesData) {
- udata_close(uCharNamesData);
- uCharNamesData = nullptr;
- }
- if(uCharNames) {
- uCharNames = nullptr;
- }
- gCharNamesInitOnce.reset();
- gMaxNameLength=0;
- return true;
- }
- static UBool U_CALLCONV
- isAcceptable(void * /*context*/,
- const char * /*type*/, const char * /*name*/,
- const UDataInfo *pInfo) {
- return
- pInfo->size>=20 &&
- pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
- pInfo->charsetFamily==U_CHARSET_FAMILY &&
- pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */
- pInfo->dataFormat[1]==0x6e &&
- pInfo->dataFormat[2]==0x61 &&
- pInfo->dataFormat[3]==0x6d &&
- pInfo->formatVersion[0]==1;
- }
- static void U_CALLCONV
- loadCharNames(UErrorCode &status) {
- U_ASSERT(uCharNamesData == nullptr);
- U_ASSERT(uCharNames == nullptr);
- uCharNamesData = udata_openChoice(nullptr, DATA_TYPE, DATA_NAME, isAcceptable, nullptr, &status);
- if(U_FAILURE(status)) {
- uCharNamesData = nullptr;
- } else {
- uCharNames = (UCharNames *)udata_getMemory(uCharNamesData);
- }
- ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
- }
- static UBool
- isDataLoaded(UErrorCode *pErrorCode) {
- umtx_initOnce(gCharNamesInitOnce, &loadCharNames, *pErrorCode);
- return U_SUCCESS(*pErrorCode);
- }
/*
 * Append one char to an output buffer with truncation:
 * the char is stored only while bufferLength>0 (buffer and bufferLength are
 * then advanced), but bufferPos is always incremented so that it ends up as
 * the full, untruncated length.
 * c is evaluated before bufferPos is incremented; callers rely on this when
 * c is computed from bufferPos.
 */
#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) UPRV_BLOCK_MACRO_BEGIN { \
    if ((bufferLength) > 0) { \
        *(buffer) = (c); \
        ++(buffer); \
        --(bufferLength); \
    } \
    ++(bufferPos); \
} UPRV_BLOCK_MACRO_END

/* Internal name choice, one past the last UCharNameChoice value; expandName() treats it as field index 2. */
#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
- /*
- * Important: expandName() and compareName() are almost the same -
- * apply fixes to both.
- *
- * UnicodeData.txt uses ';' as a field separator, so no
- * field can contain ';' as part of its contents.
- * In unames.dat, it is marked as token[';']==-1 only if the
- * semicolon is used in the data file - which is iff we
- * have Unicode 1.0 names or ISO comments or aliases.
- * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
- * although we know that it will never be part of a name.
- */
- static uint16_t
- expandName(UCharNames *names,
- const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
- char *buffer, uint16_t bufferLength) {
- uint16_t* tokens = reinterpret_cast<uint16_t*>(names) + 8;
- uint16_t token, tokenCount=*tokens++, bufferPos=0;
- uint8_t* tokenStrings = reinterpret_cast<uint8_t*>(names) + names->tokenStringOffset;
- uint8_t c;
- if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
- /*
- * skip the modern name if it is not requested _and_
- * if the semicolon byte value is a character, not a token number
- */
- if (static_cast<uint8_t>(';') >= tokenCount || tokens[static_cast<uint8_t>(';')] == static_cast<uint16_t>(-1)) {
- int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
- do {
- while(nameLength>0) {
- --nameLength;
- if(*name++==';') {
- break;
- }
- }
- } while(--fieldIndex>0);
- } else {
- /*
- * the semicolon byte value is a token number, therefore
- * only modern names are stored in unames.dat and there is no
- * such requested alternate name here
- */
- nameLength=0;
- }
- }
- /* write each letter directly, and write a token word per token */
- while(nameLength>0) {
- --nameLength;
- c=*name++;
- if(c>=tokenCount) {
- if(c!=';') {
- /* implicit letter */
- WRITE_CHAR(buffer, bufferLength, bufferPos, c);
- } else {
- /* finished */
- break;
- }
- } else {
- token=tokens[c];
- if (token == static_cast<uint16_t>(-2)) {
- /* this is a lead byte for a double-byte token */
- token=tokens[c<<8|*name++];
- --nameLength;
- }
- if (token == static_cast<uint16_t>(-1)) {
- if(c!=';') {
- /* explicit letter */
- WRITE_CHAR(buffer, bufferLength, bufferPos, c);
- } else {
- /* stop, but skip the semicolon if we are seeking
- extended names and there was no 2.0 name but there
- is a 1.0 name. */
- if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {
- if (static_cast<uint8_t>(';') >= tokenCount || tokens[static_cast<uint8_t>(';')] == static_cast<uint16_t>(-1)) {
- continue;
- }
- }
- /* finished */
- break;
- }
- } else {
- /* write token word */
- uint8_t *tokenString=tokenStrings+token;
- while((c=*tokenString++)!=0) {
- WRITE_CHAR(buffer, bufferLength, bufferPos, c);
- }
- }
- }
- }
- /* zero-terminate */
- if(bufferLength>0) {
- *buffer=0;
- }
- return bufferPos;
- }
- /*
- * compareName() is almost the same as expandName() except that it compares
- * the currently expanded name to an input name.
- * It returns the match/no match result as soon as possible.
- */
- static UBool
- compareName(UCharNames *names,
- const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
- const char *otherName) {
- uint16_t* tokens = reinterpret_cast<uint16_t*>(names) + 8;
- uint16_t token, tokenCount=*tokens++;
- uint8_t* tokenStrings = reinterpret_cast<uint8_t*>(names) + names->tokenStringOffset;
- uint8_t c;
- const char *origOtherName = otherName;
- if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
- /*
- * skip the modern name if it is not requested _and_
- * if the semicolon byte value is a character, not a token number
- */
- if (static_cast<uint8_t>(';') >= tokenCount || tokens[static_cast<uint8_t>(';')] == static_cast<uint16_t>(-1)) {
- int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
- do {
- while(nameLength>0) {
- --nameLength;
- if(*name++==';') {
- break;
- }
- }
- } while(--fieldIndex>0);
- } else {
- /*
- * the semicolon byte value is a token number, therefore
- * only modern names are stored in unames.dat and there is no
- * such requested alternate name here
- */
- nameLength=0;
- }
- }
- /* compare each letter directly, and compare a token word per token */
- while(nameLength>0) {
- --nameLength;
- c=*name++;
- if(c>=tokenCount) {
- if(c!=';') {
- /* implicit letter */
- if (static_cast<char>(c) != *otherName++) {
- return false;
- }
- } else {
- /* finished */
- break;
- }
- } else {
- token=tokens[c];
- if (token == static_cast<uint16_t>(-2)) {
- /* this is a lead byte for a double-byte token */
- token=tokens[c<<8|*name++];
- --nameLength;
- }
- if (token == static_cast<uint16_t>(-1)) {
- if(c!=';') {
- /* explicit letter */
- if (static_cast<char>(c) != *otherName++) {
- return false;
- }
- } else {
- /* stop, but skip the semicolon if we are seeking
- extended names and there was no 2.0 name but there
- is a 1.0 name. */
- if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {
- if (static_cast<uint8_t>(';') >= tokenCount || tokens[static_cast<uint8_t>(';')] == static_cast<uint16_t>(-1)) {
- continue;
- }
- }
- /* finished */
- break;
- }
- } else {
- /* write token word */
- uint8_t *tokenString=tokenStrings+token;
- while((c=*tokenString++)!=0) {
- if (static_cast<char>(c) != *otherName++) {
- return false;
- }
- }
- }
- }
- }
- /* complete match? */
- return *otherName == 0;
- }
- static uint8_t getCharCat(UChar32 cp) {
- uint8_t cat;
- if (U_IS_UNICODE_NONCHAR(cp)) {
- return U_NONCHARACTER_CODE_POINT;
- }
- if ((cat = u_charType(cp)) == U_SURROGATE) {
- cat = U_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
- }
- return cat;
- }
- static const char *getCharCatName(UChar32 cp) {
- uint8_t cat = getCharCat(cp);
- /* Return unknown if the table of names above is not up to
- date. */
- if (cat >= UPRV_LENGTHOF(charCatNames)) {
- return "unknown";
- } else {
- return charCatNames[cat];
- }
- }
- static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
- const char *catname = getCharCatName(code);
- uint16_t length = 0;
- UChar32 cp;
- int ndigits, i;
-
- WRITE_CHAR(buffer, bufferLength, length, '<');
- while (catname[length - 1]) {
- WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]);
- }
- WRITE_CHAR(buffer, bufferLength, length, '-');
- for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4)
- ;
- if (ndigits < 4)
- ndigits = 4;
- for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) {
- uint8_t v = static_cast<uint8_t>(cp & 0xf);
- buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10);
- }
- buffer += ndigits;
- length += static_cast<uint16_t>(ndigits);
- WRITE_CHAR(buffer, bufferLength, length, '>');
- return length;
- }
- /*
- * getGroup() does a binary search for the group that contains the
- * Unicode code point "code".
- * The return value is always a valid Group* that may contain "code"
- * or else is the highest group before "code".
- * If the lowest group is after "code", then that one is returned.
- */
- static const uint16_t *
- getGroup(UCharNames *names, uint32_t code) {
- const uint16_t *groups=GET_GROUPS(names);
- uint16_t groupMSB = static_cast<uint16_t>(code >> GROUP_SHIFT),
- start=0,
- limit=*groups++,
- number;
- /* binary search for the group of names that contains the one for code */
- while(start<limit-1) {
- number = static_cast<uint16_t>((start + limit) / 2);
- if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) {
- limit=number;
- } else {
- start=number;
- }
- }
- /* return this regardless of whether it is an exact match */
- return groups+start*GROUP_LENGTH;
- }
- /*
- * expandGroupLengths() reads a block of compressed lengths of 32 strings and
- * expands them into offsets and lengths for each string.
- * Lengths are stored with a variable-width encoding in consecutive nibbles:
- * If a nibble<0xc, then it is the length itself (0=empty string).
- * If a nibble>=0xc, then it forms a length value with the following nibble.
- * Calculation see below.
- * The offsets and lengths arrays must be at least 33 (one more) long because
- * there is no check here at the end if the last nibble is still used.
- */
- static const uint8_t *
- expandGroupLengths(const uint8_t *s,
- uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
- /* read the lengths of the 32 strings in this group and get each string's offset */
- uint16_t i=0, offset=0, length=0;
- uint8_t lengthByte;
- /* all 32 lengths must be read to get the offset of the first group string */
- while(i<LINES_PER_GROUP) {
- lengthByte=*s++;
- /* read even nibble - MSBs of lengthByte */
- if(length>=12) {
- /* double-nibble length spread across two bytes */
- length = static_cast<uint16_t>(((length & 0x3) << 4 | lengthByte >> 4) + 12);
- lengthByte&=0xf;
- } else if((lengthByte /* &0xf0 */)>=0xc0) {
- /* double-nibble length spread across this one byte */
- length = static_cast<uint16_t>((lengthByte & 0x3f) + 12);
- } else {
- /* single-nibble length in MSBs */
- length = static_cast<uint16_t>(lengthByte >> 4);
- lengthByte&=0xf;
- }
- *offsets++=offset;
- *lengths++=length;
- offset+=length;
- ++i;
- /* read odd nibble - LSBs of lengthByte */
- if((lengthByte&0xf0)==0) {
- /* this nibble was not consumed for a double-nibble length above */
- length=lengthByte;
- if(length<12) {
- /* single-nibble length in LSBs */
- *offsets++=offset;
- *lengths++=length;
- offset+=length;
- ++i;
- }
- } else {
- length=0; /* prevent double-nibble detection in the next iteration */
- }
- }
- /* now, s is at the first group string */
- return s;
- }
- static uint16_t
- expandGroupName(UCharNames *names, const uint16_t *group,
- uint16_t lineNumber, UCharNameChoice nameChoice,
- char *buffer, uint16_t bufferLength) {
- uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
- const uint8_t* s = reinterpret_cast<uint8_t*>(names) + names->groupStringOffset + GET_GROUP_OFFSET(group);
- s=expandGroupLengths(s, offsets, lengths);
- return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice,
- buffer, bufferLength);
- }
- static uint16_t
- getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
- char *buffer, uint16_t bufferLength) {
- const uint16_t *group=getGroup(names, code);
- if (static_cast<uint16_t>(code >> GROUP_SHIFT) == group[GROUP_MSB]) {
- return expandGroupName(names, group, static_cast<uint16_t>(code & GROUP_MASK), nameChoice,
- buffer, bufferLength);
- } else {
- /* group not found */
- /* zero-terminate */
- if(bufferLength>0) {
- *buffer=0;
- }
- return 0;
- }
- }
- /*
- * enumGroupNames() enumerates all the names in a 32-group
- * and either calls the enumerator function or finds a given input name.
- */
- static UBool
- enumGroupNames(UCharNames *names, const uint16_t *group,
- UChar32 start, UChar32 end,
- UEnumCharNamesFn *fn, void *context,
- UCharNameChoice nameChoice) {
- uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
- const uint8_t* s = reinterpret_cast<uint8_t*>(names) + names->groupStringOffset + GET_GROUP_OFFSET(group);
- s=expandGroupLengths(s, offsets, lengths);
- if(fn!=DO_FIND_NAME) {
- char buffer[200];
- uint16_t length;
- while(start<=end) {
- length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer));
- if (!length && nameChoice == U_EXTENDED_CHAR_NAME) {
- buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
- }
- /* here, we assume that the buffer is large enough */
- if(length>0) {
- if(!fn(context, start, nameChoice, buffer, length)) {
- return false;
- }
- }
- ++start;
- }
- } else {
- const char* otherName = static_cast<FindName*>(context)->otherName;
- while(start<=end) {
- if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) {
- static_cast<FindName*>(context)->code = start;
- return false;
- }
- ++start;
- }
- }
- return true;
- }
- /*
- * enumExtNames enumerate extended names.
- * It only needs to do it if it is called with a real function and not
- * with the dummy DO_FIND_NAME, because u_charFromName() does a check
- * for extended names by itself.
- */
- static UBool
- enumExtNames(UChar32 start, UChar32 end,
- UEnumCharNamesFn *fn, void *context)
- {
- if(fn!=DO_FIND_NAME) {
- char buffer[200];
- uint16_t length;
-
- while(start<=end) {
- buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
- /* here, we assume that the buffer is large enough */
- if(length>0) {
- if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) {
- return false;
- }
- }
- ++start;
- }
- }
- return true;
- }
- static UBool
- enumNames(UCharNames *names,
- UChar32 start, UChar32 limit,
- UEnumCharNamesFn *fn, void *context,
- UCharNameChoice nameChoice) {
- uint16_t startGroupMSB, endGroupMSB, groupCount;
- const uint16_t *group, *groupLimit;
- startGroupMSB = static_cast<uint16_t>(start >> GROUP_SHIFT);
- endGroupMSB = static_cast<uint16_t>((limit - 1) >> GROUP_SHIFT);
- /* find the group that contains start, or the highest before it */
- group=getGroup(names, start);
- if(startGroupMSB<group[GROUP_MSB] && nameChoice==U_EXTENDED_CHAR_NAME) {
- /* enumerate synthetic names between start and the group start */
- UChar32 extLimit = static_cast<UChar32>(group[GROUP_MSB]) << GROUP_SHIFT;
- if(extLimit>limit) {
- extLimit=limit;
- }
- if(!enumExtNames(start, extLimit-1, fn, context)) {
- return false;
- }
- start=extLimit;
- }
- if(startGroupMSB==endGroupMSB) {
- if(startGroupMSB==group[GROUP_MSB]) {
- /* if start and limit-1 are in the same group, then enumerate only in that one */
- return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
- }
- } else {
- const uint16_t *groups=GET_GROUPS(names);
- groupCount=*groups++;
- groupLimit=groups+groupCount*GROUP_LENGTH;
- if(startGroupMSB==group[GROUP_MSB]) {
- /* enumerate characters in the partial start group */
- if((start&GROUP_MASK)!=0) {
- if(!enumGroupNames(names, group,
- start, (static_cast<UChar32>(startGroupMSB) << GROUP_SHIFT) + LINES_PER_GROUP - 1,
- fn, context, nameChoice)) {
- return false;
- }
- group=NEXT_GROUP(group); /* continue with the next group */
- }
- } else if(startGroupMSB>group[GROUP_MSB]) {
- /* make sure that we start enumerating with the first group after start */
- const uint16_t *nextGroup=NEXT_GROUP(group);
- if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
- UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
- if (end > limit) {
- end = limit;
- }
- if (!enumExtNames(start, end - 1, fn, context)) {
- return false;
- }
- }
- group=nextGroup;
- }
- /* enumerate entire groups between the start- and end-groups */
- while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) {
- const uint16_t *nextGroup;
- start = static_cast<UChar32>(group[GROUP_MSB]) << GROUP_SHIFT;
- if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {
- return false;
- }
- nextGroup=NEXT_GROUP(group);
- if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
- UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
- if (end > limit) {
- end = limit;
- }
- if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) {
- return false;
- }
- }
- group=nextGroup;
- }
- /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
- if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) {
- return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
- } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
- UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT;
- if (next > start) {
- start = next;
- }
- } else {
- return true;
- }
- }
- /* we have not found a group, which means everything is made of
- extended names. */
- if (nameChoice == U_EXTENDED_CHAR_NAME) {
- if (limit > UCHAR_MAX_VALUE + 1) {
- limit = UCHAR_MAX_VALUE + 1;
- }
- return enumExtNames(start, limit - 1, fn, context);
- }
-
- return true;
- }
- static uint16_t
- writeFactorSuffix(const uint16_t *factors, uint16_t count,
- const char *s, /* suffix elements */
- uint32_t code,
- uint16_t indexes[8], /* output fields from here */
- const char *elementBases[8], const char *elements[8],
- char *buffer, uint16_t bufferLength) {
- uint16_t i, factor, bufferPos=0;
- char c;
- /* write elements according to the factors */
- /*
- * the factorized elements are determined by modulo arithmetic
- * with the factors of this algorithm
- *
- * note that for fewer operations, count is decremented here
- */
- --count;
- for(i=count; i>0; --i) {
- factor=factors[i];
- indexes[i] = static_cast<uint16_t>(code % factor);
- code/=factor;
- }
- /*
- * we don't need to calculate the last modulus because start<=code<=end
- * guarantees here that code<=factors[0]
- */
- indexes[0] = static_cast<uint16_t>(code);
- /* write each element */
- for(;;) {
- if(elementBases!=nullptr) {
- *elementBases++=s;
- }
- /* skip indexes[i] strings */
- factor=indexes[i];
- while(factor>0) {
- while(*s++!=0) {}
- --factor;
- }
- if(elements!=nullptr) {
- *elements++=s;
- }
- /* write element */
- while((c=*s++)!=0) {
- WRITE_CHAR(buffer, bufferLength, bufferPos, c);
- }
- /* we do not need to perform the rest of this loop for i==count - break here */
- if(i>=count) {
- break;
- }
- /* skip the rest of the strings for this factors[i] */
- factor = static_cast<uint16_t>(factors[i] - indexes[i] - 1);
- while(factor>0) {
- while(*s++!=0) {}
- --factor;
- }
- ++i;
- }
- /* zero-terminate */
- if(bufferLength>0) {
- *buffer=0;
- }
- return bufferPos;
- }
- /*
- * Important:
- * Parts of findAlgName() are almost the same as some of getAlgName().
- * Fixes must be applied to both.
- */
- static uint16_t
- getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
- char *buffer, uint16_t bufferLength) {
- uint16_t bufferPos=0;
- /* Only the normative character name can be algorithmic. */
- if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
- /* zero-terminate */
- if(bufferLength>0) {
- *buffer=0;
- }
- return 0;
- }
- switch(range->type) {
- case 0: {
- /* name = prefix hex-digits */
- const char* s = reinterpret_cast<const char*>(range + 1);
- char c;
- uint16_t i, count;
- /* copy prefix */
- while((c=*s++)!=0) {
- WRITE_CHAR(buffer, bufferLength, bufferPos, c);
- }
- /* write hexadecimal code point value */
- count=range->variant;
- /* zero-terminate */
- if(count<bufferLength) {
- buffer[count]=0;
- }
- for(i=count; i>0;) {
- if(--i<bufferLength) {
- c = static_cast<char>(code & 0xf);
- if(c<10) {
- c+='0';
- } else {
- c+='A'-10;
- }
- buffer[i]=c;
- }
- code>>=4;
- }
- bufferPos+=count;
- break;
- }
- case 1: {
- /* name = prefix factorized-elements */
- uint16_t indexes[8];
- const uint16_t* factors = reinterpret_cast<const uint16_t*>(range + 1);
- uint16_t count=range->variant;
- const char* s = reinterpret_cast<const char*>(factors + count);
- char c;
- /* copy prefix */
- while((c=*s++)!=0) {
- WRITE_CHAR(buffer, bufferLength, bufferPos, c);
- }
- bufferPos+=writeFactorSuffix(factors, count,
- s, code-range->start, indexes, nullptr, nullptr, buffer, bufferLength);
- break;
- }
- default:
- /* undefined type */
- /* zero-terminate */
- if(bufferLength>0) {
- *buffer=0;
- }
- break;
- }
- return bufferPos;
- }
- /*
- * Important: enumAlgNames() and findAlgName() are almost the same.
- * Any fix must be applied to both.
- */
- static UBool
- enumAlgNames(AlgorithmicRange *range,
- UChar32 start, UChar32 limit,
- UEnumCharNamesFn *fn, void *context,
- UCharNameChoice nameChoice) {
- char buffer[200];
- uint16_t length;
- if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
- return true;
- }
- switch(range->type) {
- case 0: {
- char *s, *end;
- char c;
- /* get the full name of the start character */
- length = getAlgName(range, static_cast<uint32_t>(start), nameChoice, buffer, sizeof(buffer));
- if(length<=0) {
- return true;
- }
- /* call the enumerator function with this first character */
- if(!fn(context, start, nameChoice, buffer, length)) {
- return false;
- }
- /* go to the end of the name; all these names have the same length */
- end=buffer;
- while(*end!=0) {
- ++end;
- }
- /* enumerate the rest of the names */
- while(++start<limit) {
- /* increment the hexadecimal number on a character-basis */
- s=end;
- for (;;) {
- c=*--s;
- if(('0'<=c && c<'9') || ('A'<=c && c<'F')) {
- *s = static_cast<char>(c + 1);
- break;
- } else if(c=='9') {
- *s='A';
- break;
- } else if(c=='F') {
- *s='0';
- }
- }
- if(!fn(context, start, nameChoice, buffer, length)) {
- return false;
- }
- }
- break;
- }
- case 1: {
- uint16_t indexes[8];
- const char *elementBases[8], *elements[8];
- const uint16_t* factors = reinterpret_cast<const uint16_t*>(range + 1);
- uint16_t count=range->variant;
- const char* s = reinterpret_cast<const char*>(factors + count);
- char *suffix, *t;
- uint16_t prefixLength, i, idx;
- char c;
- /* name = prefix factorized-elements */
- /* copy prefix */
- suffix=buffer;
- prefixLength=0;
- while((c=*s++)!=0) {
- *suffix++=c;
- ++prefixLength;
- }
- /* append the suffix of the start character */
- length = static_cast<uint16_t>(prefixLength + writeFactorSuffix(factors, count,
- s, static_cast<uint32_t>(start) - range->start,
- indexes, elementBases, elements,
- suffix, static_cast<uint16_t>(sizeof(buffer) - prefixLength)));
- /* call the enumerator function with this first character */
- if(!fn(context, start, nameChoice, buffer, length)) {
- return false;
- }
- /* enumerate the rest of the names */
- while(++start<limit) {
- /* increment the indexes in lexical order bound by the factors */
- i=count;
- for (;;) {
- idx = static_cast<uint16_t>(indexes[--i] + 1);
- if(idx<factors[i]) {
- /* skip one index and its element string */
- indexes[i]=idx;
- s=elements[i];
- while(*s++!=0) {
- }
- elements[i]=s;
- break;
- } else {
- /* reset this index to 0 and its element string to the first one */
- indexes[i]=0;
- elements[i]=elementBases[i];
- }
- }
- /* to make matters a little easier, just append all elements to the suffix */
- t=suffix;
- length=prefixLength;
- for(i=0; i<count; ++i) {
- s=elements[i];
- while((c=*s++)!=0) {
- *t++=c;
- ++length;
- }
- }
- /* zero-terminate */
- *t=0;
- if(!fn(context, start, nameChoice, buffer, length)) {
- return false;
- }
- }
- break;
- }
- default:
- /* undefined type */
- break;
- }
- return true;
- }
- /*
- * findAlgName() is almost the same as enumAlgNames() except that it
- * returns the code point for a name if it fits into the range.
- * It returns 0xffff otherwise.
- */
- static UChar32
- findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) {
- UChar32 code;
- if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
- return 0xffff;
- }
- switch(range->type) {
- case 0: {
- /* name = prefix hex-digits */
- const char* s = reinterpret_cast<const char*>(range + 1);
- char c;
- uint16_t i, count;
- /* compare prefix */
- while((c=*s++)!=0) {
- if (c != *otherName++) {
- return 0xffff;
- }
- }
- /* read hexadecimal code point value */
- count=range->variant;
- code=0;
- for(i=0; i<count; ++i) {
- c=*otherName++;
- if('0'<=c && c<='9') {
- code=(code<<4)|(c-'0');
- } else if('A'<=c && c<='F') {
- code=(code<<4)|(c-'A'+10);
- } else {
- return 0xffff;
- }
- }
- /* does it fit into the range? */
- if (*otherName == 0 && range->start <= static_cast<uint32_t>(code) && static_cast<uint32_t>(code) <= range->end) {
- return code;
- }
- break;
- }
- case 1: {
- char buffer[64];
- uint16_t indexes[8];
- const char *elementBases[8], *elements[8];
- const uint16_t* factors = reinterpret_cast<const uint16_t*>(range + 1);
- uint16_t count=range->variant;
- const char *s = reinterpret_cast<const char*>(factors + count), *t;
- UChar32 start, limit;
- uint16_t i, idx;
- char c;
- /* name = prefix factorized-elements */
- /* compare prefix */
- while((c=*s++)!=0) {
- if (c != *otherName++) {
- return 0xffff;
- }
- }
- start = static_cast<UChar32>(range->start);
- limit = static_cast<UChar32>(range->end + 1);
- /* initialize the suffix elements for enumeration; indexes should all be set to 0 */
- writeFactorSuffix(factors, count, s, 0,
- indexes, elementBases, elements, buffer, sizeof(buffer));
- /* compare the first suffix */
- if(0==uprv_strcmp(otherName, buffer)) {
- return start;
- }
- /* enumerate and compare the rest of the suffixes */
- while(++start<limit) {
- /* increment the indexes in lexical order bound by the factors */
- i=count;
- for (;;) {
- idx = static_cast<uint16_t>(indexes[--i] + 1);
- if(idx<factors[i]) {
- /* skip one index and its element string */
- indexes[i]=idx;
- s=elements[i];
- while(*s++!=0) {}
- elements[i]=s;
- break;
- } else {
- /* reset this index to 0 and its element string to the first one */
- indexes[i]=0;
- elements[i]=elementBases[i];
- }
- }
- /* to make matters a little easier, just compare all elements of the suffix */
- t=otherName;
- for(i=0; i<count; ++i) {
- s=elements[i];
- while((c=*s++)!=0) {
- if(c!=*t++) {
- s=""; /* does not match */
- i=99;
- }
- }
- }
- if(i<99 && *t==0) {
- return start;
- }
- }
- break;
- }
- default:
- /* undefined type */
- break;
- }
- return 0xffff;
- }
- /* sets of name characters, maximum name lengths ---------------------------- */
/*
 * Bit-set helpers for a set of byte values stored in uint32_t[8] (256 bits).
 * The character argument is converted to uint8_t as a whole; it is fully
 * parenthesized so that expression arguments (e.g. x|0x20) are cast and
 * shifted as one value instead of being split by operator precedence.
 * Note that c is evaluated twice; do not pass expressions with side effects.
 */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
- static int32_t
- calcStringSetLength(uint32_t set[8], const char *s) {
- int32_t length=0;
- char c;
- while((c=*s++)!=0) {
- SET_ADD(set, c);
- ++length;
- }
- return length;
- }
- static int32_t
- calcAlgNameSetsLengths(int32_t maxNameLength) {
- AlgorithmicRange *range;
- uint32_t *p;
- uint32_t rangeCount;
- int32_t length;
- /* enumerate algorithmic ranges */
- p = reinterpret_cast<uint32_t*>(reinterpret_cast<uint8_t*>(uCharNames) + uCharNames->algNamesOffset);
- rangeCount=*p;
- range = reinterpret_cast<AlgorithmicRange*>(p + 1);
- while(rangeCount>0) {
- switch(range->type) {
- case 0:
- /* name = prefix + (range->variant times) hex-digits */
- /* prefix */
- length = calcStringSetLength(gNameSet, reinterpret_cast<const char*>(range + 1)) + range->variant;
- if(length>maxNameLength) {
- maxNameLength=length;
- }
- break;
- case 1: {
- /* name = prefix factorized-elements */
- const uint16_t* factors = reinterpret_cast<const uint16_t*>(range + 1);
- const char *s;
- int32_t i, count=range->variant, factor, factorLength, maxFactorLength;
- /* prefix length */
- s = reinterpret_cast<const char*>(factors + count);
- length=calcStringSetLength(gNameSet, s);
- s+=length+1; /* start of factor suffixes */
- /* get the set and maximum factor suffix length for each factor */
- for(i=0; i<count; ++i) {
- maxFactorLength=0;
- for(factor=factors[i]; factor>0; --factor) {
- factorLength=calcStringSetLength(gNameSet, s);
- s+=factorLength+1;
- if(factorLength>maxFactorLength) {
- maxFactorLength=factorLength;
- }
- }
- length+=maxFactorLength;
- }
- if(length>maxNameLength) {
- maxNameLength=length;
- }
- break;
- }
- default:
- /* unknown type */
- break;
- }
- range = reinterpret_cast<AlgorithmicRange*>(reinterpret_cast<uint8_t*>(range) + range->size);
- --rangeCount;
- }
- return maxNameLength;
- }
- static int32_t
- calcExtNameSetsLengths(int32_t maxNameLength) {
- int32_t i, length;
- for(i=0; i<UPRV_LENGTHOF(charCatNames); ++i) {
- /*
- * for each category, count the length of the category name
- * plus 9=
- * 2 for <>
- * 1 for -
- * 6 for most hex digits per code point
- */
- length=9+calcStringSetLength(gNameSet, charCatNames[i]);
- if(length>maxNameLength) {
- maxNameLength=length;
- }
- }
- return maxNameLength;
- }
- static int32_t
- calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths,
- uint32_t set[8],
- const uint8_t **pLine, const uint8_t *lineLimit) {
- const uint8_t *line=*pLine;
- int32_t length=0, tokenLength;
- uint16_t c, token;
- while (line != lineLimit && (c = *line++) != static_cast<uint8_t>(';')) {
- if(c>=tokenCount) {
- /* implicit letter */
- SET_ADD(set, c);
- ++length;
- } else {
- token=tokens[c];
- if (token == static_cast<uint16_t>(-2)) {
- /* this is a lead byte for a double-byte token */
- c=c<<8|*line++;
- token=tokens[c];
- }
- if (token == static_cast<uint16_t>(-1)) {
- /* explicit letter */
- SET_ADD(set, c);
- ++length;
- } else {
- /* count token word */
- if(tokenLengths!=nullptr) {
- /* use cached token length */
- tokenLength=tokenLengths[c];
- if(tokenLength==0) {
- tokenLength = calcStringSetLength(set, reinterpret_cast<const char*>(tokenStrings) + token);
- tokenLengths[c] = static_cast<int8_t>(tokenLength);
- }
- } else {
- tokenLength = calcStringSetLength(set, reinterpret_cast<const char*>(tokenStrings) + token);
- }
- length+=tokenLength;
- }
- }
- }
- *pLine=line;
- return length;
- }
- static void
- calcGroupNameSetsLengths(int32_t maxNameLength) {
- uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
- uint16_t* tokens = reinterpret_cast<uint16_t*>(uCharNames) + 8;
- uint16_t tokenCount=*tokens++;
- uint8_t* tokenStrings = reinterpret_cast<uint8_t*>(uCharNames) + uCharNames->tokenStringOffset;
- int8_t *tokenLengths;
- const uint16_t *group;
- const uint8_t *s, *line, *lineLimit;
- int32_t groupCount, lineNumber, length;
- tokenLengths = static_cast<int8_t*>(uprv_malloc(tokenCount));
- if(tokenLengths!=nullptr) {
- uprv_memset(tokenLengths, 0, tokenCount);
- }
- group=GET_GROUPS(uCharNames);
- groupCount=*group++;
- /* enumerate all groups */
- while(groupCount>0) {
- s = reinterpret_cast<uint8_t*>(uCharNames) + uCharNames->groupStringOffset + GET_GROUP_OFFSET(group);
- s=expandGroupLengths(s, offsets, lengths);
- /* enumerate all lines in each group */
- for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {
- line=s+offsets[lineNumber];
- length=lengths[lineNumber];
- if(length==0) {
- continue;
- }
- lineLimit=line+length;
- /* read regular name */
- length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
- if(length>maxNameLength) {
- maxNameLength=length;
- }
- if(line==lineLimit) {
- continue;
- }
- /* read Unicode 1.0 name */
- length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
- if(length>maxNameLength) {
- maxNameLength=length;
- }
- if(line==lineLimit) {
- continue;
- }
- /* read ISO comment */
- /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
- }
- group=NEXT_GROUP(group);
- --groupCount;
- }
- if(tokenLengths!=nullptr) {
- uprv_free(tokenLengths);
- }
- /* set gMax... - name length last for threading */
- gMaxNameLength=maxNameLength;
- }
- static UBool
- calcNameSetsLengths(UErrorCode *pErrorCode) {
- static const char extChars[]="0123456789ABCDEF<>-";
- int32_t i, maxNameLength;
- if(gMaxNameLength!=0) {
- return true;
- }
- if(!isDataLoaded(pErrorCode)) {
- return false;
- }
- /* set hex digits, used in various names, and <>-, used in extended names */
- for (i = 0; i < static_cast<int32_t>(sizeof(extChars)) - 1; ++i) {
- SET_ADD(gNameSet, extChars[i]);
- }
- /* set sets and lengths from algorithmic names */
- maxNameLength=calcAlgNameSetsLengths(0);
- /* set sets and lengths from extended names */
- maxNameLength=calcExtNameSetsLengths(maxNameLength);
- /* set sets and lengths from group names, set global maximum values */
- calcGroupNameSetsLengths(maxNameLength);
- return true;
- }
- U_NAMESPACE_END
- /* public API --------------------------------------------------------------- */
- U_NAMESPACE_USE
- U_CAPI int32_t U_EXPORT2
- u_charName(UChar32 code, UCharNameChoice nameChoice,
- char *buffer, int32_t bufferLength,
- UErrorCode *pErrorCode) {
- AlgorithmicRange *algRange;
- uint32_t *p;
- uint32_t i;
- int32_t length;
- /* check the argument values */
- if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
- return 0;
- } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT ||
- bufferLength<0 || (bufferLength>0 && buffer==nullptr)
- ) {
- *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
- return 0;
- }
- if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
- return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
- }
- length=0;
- /* try algorithmic names first */
- p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
- i=*p;
- algRange=(AlgorithmicRange *)(p+1);
- while(i>0) {
- if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
- length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
- break;
- }
- algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
- --i;
- }
- if(i==0) {
- if (nameChoice == U_EXTENDED_CHAR_NAME) {
- length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength);
- if (!length) {
- /* extended character name */
- length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength);
- }
- } else {
- /* normal character name */
- length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
- }
- }
- return u_terminateChars(buffer, bufferLength, length, pErrorCode);
- }
- U_CAPI int32_t U_EXPORT2
- u_getISOComment(UChar32 /*c*/,
- char *dest, int32_t destCapacity,
- UErrorCode *pErrorCode) {
- /* check the argument values */
- if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
- return 0;
- } else if(destCapacity<0 || (destCapacity>0 && dest==nullptr)) {
- *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
- return 0;
- }
- return u_terminateChars(dest, destCapacity, 0, pErrorCode);
- }
- U_CAPI UChar32 U_EXPORT2
- u_charFromName(UCharNameChoice nameChoice,
- const char *name,
- UErrorCode *pErrorCode) {
- char upper[120] = {0};
- char lower[120] = {0};
- FindName findName;
- AlgorithmicRange *algRange;
- uint32_t *p;
- uint32_t i;
- UChar32 cp = 0;
- char c0;
- static constexpr UChar32 error = 0xffff; /* Undefined, but use this for backwards compatibility. */
- if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
- return error;
- }
- if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==nullptr || *name==0) {
- *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
- return error;
- }
- if(!isDataLoaded(pErrorCode)) {
- return error;
- }
- /* construct the uppercase and lowercase of the name first */
- for(i=0; i<sizeof(upper); ++i) {
- if((c0=*name++)!=0) {
- upper[i]=uprv_toupper(c0);
- lower[i]=uprv_tolower(c0);
- } else {
- upper[i]=lower[i]=0;
- break;
- }
- }
- if(i==sizeof(upper)) {
- /* name too long, there is no such character */
- *pErrorCode = U_ILLEGAL_CHAR_FOUND;
- return error;
- }
- // i==strlen(name)==strlen(lower)==strlen(upper)
- /* try extended names first */
- if (lower[0] == '<') {
- if (nameChoice == U_EXTENDED_CHAR_NAME && lower[--i] == '>') {
- // Parse a string like "<category-HHHH>" where HHHH is a hex code point.
- uint32_t limit = i;
- while (i >= 3 && lower[--i] != '-') {}
- // There should be 1 to 8 hex digits.
- int32_t hexLength = limit - (i + 1);
- if (i >= 2 && lower[i] == '-' && 1 <= hexLength && hexLength <= 8) {
- uint32_t cIdx;
- lower[i] = 0;
- for (++i; i < limit; ++i) {
- if (lower[i] >= '0' && lower[i] <= '9') {
- cp = (cp << 4) + lower[i] - '0';
- } else if (lower[i] >= 'a' && lower[i] <= 'f') {
- cp = (cp << 4) + lower[i] - 'a' + 10;
- } else {
- *pErrorCode = U_ILLEGAL_CHAR_FOUND;
- return error;
- }
- // Prevent signed-integer overflow and out-of-range code points.
- if (cp > UCHAR_MAX_VALUE) {
- *pErrorCode = U_ILLEGAL_CHAR_FOUND;
- return error;
- }
- }
- /* Now validate the category name.
- We could use a binary search, or a trie, if
- we really wanted to. */
- uint8_t cat = getCharCat(cp);
- for (lower[i] = 0, cIdx = 0; cIdx < UPRV_LENGTHOF(charCatNames); ++cIdx) {
- if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
- if (cat == cIdx) {
- return cp;
- }
- break;
- }
- }
- }
- }
- *pErrorCode = U_ILLEGAL_CHAR_FOUND;
- return error;
- }
- /* try algorithmic names now */
- p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
- i=*p;
- algRange=(AlgorithmicRange *)(p+1);
- while(i>0) {
- if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
- return cp;
- }
- algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
- --i;
- }
- /* normal character name */
- findName.otherName=upper;
- findName.code=error;
- enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
- if (findName.code == error) {
- *pErrorCode = U_ILLEGAL_CHAR_FOUND;
- }
- return findName.code;
- }
- U_CAPI void U_EXPORT2
- u_enumCharNames(UChar32 start, UChar32 limit,
- UEnumCharNamesFn *fn,
- void *context,
- UCharNameChoice nameChoice,
- UErrorCode *pErrorCode) {
- AlgorithmicRange *algRange;
- uint32_t *p;
- uint32_t i;
- if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
- return;
- }
- if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==nullptr) {
- *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
- return;
- }
- if((uint32_t) limit > UCHAR_MAX_VALUE + 1) {
- limit = UCHAR_MAX_VALUE + 1;
- }
- if((uint32_t)start>=(uint32_t)limit) {
- return;
- }
- if(!isDataLoaded(pErrorCode)) {
- return;
- }
- /* interleave the data-driven ones with the algorithmic ones */
- /* iterate over all algorithmic ranges; assume that they are in ascending order */
- p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
- i=*p;
- algRange=(AlgorithmicRange *)(p+1);
- while(i>0) {
- /* enumerate the character names before the current algorithmic range */
- /* here: start<limit */
- if((uint32_t)start<algRange->start) {
- if((uint32_t)limit<=algRange->start) {
- enumNames(uCharNames, start, limit, fn, context, nameChoice);
- return;
- }
- if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
- return;
- }
- start=(UChar32)algRange->start;
- }
- /* enumerate the character names in the current algorithmic range */
- /* here: algRange->start<=start<limit */
- if((uint32_t)start<=algRange->end) {
- if((uint32_t)limit<=(algRange->end+1)) {
- enumAlgNames(algRange, start, limit, fn, context, nameChoice);
- return;
- }
- if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
- return;
- }
- start=(UChar32)algRange->end+1;
- }
- /* continue to the next algorithmic range (here: start<limit) */
- algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
- --i;
- }
- /* enumerate the character names after the last algorithmic range */
- enumNames(uCharNames, start, limit, fn, context, nameChoice);
- }
- U_CAPI int32_t U_EXPORT2
- uprv_getMaxCharNameLength() {
- UErrorCode errorCode=U_ZERO_ERROR;
- if(calcNameSetsLengths(&errorCode)) {
- return gMaxNameLength;
- } else {
- return 0;
- }
- }
- /**
- * Converts the char set cset into a Unicode set uset.
- * @param cset Set of 256 bit flags corresponding to a set of chars.
- * @param uset USet to receive characters. Existing contents are deleted.
- */
- static void
- charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
- char16_t us[256];
- char cs[256];
- int32_t i, length;
- UErrorCode errorCode;
- errorCode=U_ZERO_ERROR;
- if(!calcNameSetsLengths(&errorCode)) {
- return;
- }
- /* build a char string with all chars that are used in character names */
- length=0;
- for(i=0; i<256; ++i) {
- if(SET_CONTAINS(cset, i)) {
- cs[length++] = static_cast<char>(i);
- }
- }
- /* convert the char string to a char16_t string */
- u_charsToUChars(cs, us, length);
- /* add each char16_t to the USet */
- for(i=0; i<length; ++i) {
- if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (char16_t)0 */
- sa->add(sa->set, us[i]);
- }
- }
- }
- /**
- * Fills set with characters that are used in Unicode character names.
- * @param set USet to receive characters.
- */
- U_CAPI void U_EXPORT2
- uprv_getCharNameCharacters(const USetAdder *sa) {
- charSetToUSet(gNameSet, sa);
- }
- /* data swapping ------------------------------------------------------------ */
- /*
- * The token table contains non-negative entries for token bytes,
- * and -1 for bytes that represent themselves in the data file's charset.
- * -2 entries are used for lead bytes.
- *
- * Direct bytes (-1 entries) must be translated from the input charset family
- * to the output charset family.
- * makeTokenMap() writes a permutation mapping for this.
- * Use it once for single-/lead-byte tokens and once more for all trail byte
- * tokens. (';' is an unused trail byte marked with -1.)
- */
- static void
- makeTokenMap(const UDataSwapper *ds,
- int16_t tokens[], uint16_t tokenCount,
- uint8_t map[256],
- UErrorCode *pErrorCode) {
- UBool usedOutChar[256];
- uint16_t i, j;
- uint8_t c1, c2;
- if(U_FAILURE(*pErrorCode)) {
- return;
- }
- if(ds->inCharset==ds->outCharset) {
- /* Same charset family: identity permutation */
- for(i=0; i<256; ++i) {
- map[i] = static_cast<uint8_t>(i);
- }
- } else {
- uprv_memset(map, 0, 256);
- uprv_memset(usedOutChar, 0, 256);
- if(tokenCount>256) {
- tokenCount=256;
- }
- /* set the direct bytes (byte 0 always maps to itself) */
- for(i=1; i<tokenCount; ++i) {
- if(tokens[i]==-1) {
- /* convert the direct byte character */
- c1 = static_cast<uint8_t>(i);
- ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
- if(U_FAILURE(*pErrorCode)) {
- udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
- i, ds->inCharset);
- return;
- }
- /* enter the converted character into the map and mark it used */
- map[c1]=c2;
- usedOutChar[c2]=true;
- }
- }
- /* set the mappings for the rest of the permutation */
- for(i=j=1; i<tokenCount; ++i) {
- /* set mappings that were not set for direct bytes */
- if(map[i]==0) {
- /* set an output byte value that was not used as an output byte above */
- while(usedOutChar[j]) {
- ++j;
- }
- map[i] = static_cast<uint8_t>(j++);
- }
- }
- /*
- * leave mappings at tokenCount and above unset if tokenCount<256
- * because they won't be used
- */
- }
- }
- U_CAPI int32_t U_EXPORT2
- uchar_swapNames(const UDataSwapper *ds,
- const void *inData, int32_t length, void *outData,
- UErrorCode *pErrorCode) {
- const UDataInfo *pInfo;
- int32_t headerSize;
- const uint8_t *inBytes;
- uint8_t *outBytes;
- uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
- offset, i, count, stringsCount;
- const AlgorithmicRange *inRange;
- AlgorithmicRange *outRange;
- /* udata_swapDataHeader checks the arguments */
- headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
- if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
- return 0;
- }
- /* check data format and format version */
- pInfo=(const UDataInfo *)((const char *)inData+4);
- if(!(
- pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */
- pInfo->dataFormat[1]==0x6e &&
- pInfo->dataFormat[2]==0x61 &&
- pInfo->dataFormat[3]==0x6d &&
- pInfo->formatVersion[0]==1
- )) {
- udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
- pInfo->dataFormat[0], pInfo->dataFormat[1],
- pInfo->dataFormat[2], pInfo->dataFormat[3],
- pInfo->formatVersion[0]);
- *pErrorCode=U_UNSUPPORTED_ERROR;
- return 0;
- }
- inBytes=(const uint8_t *)inData+headerSize;
- outBytes=(outData == nullptr) ? nullptr : (uint8_t *)outData+headerSize;
- if(length<0) {
- algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);
- } else {
- length-=headerSize;
- if( length<20 ||
- (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
- ) {
- udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
- length);
- *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
- return 0;
- }
- }
- if(length<0) {
- /* preflighting: iterate through algorithmic ranges */
- offset=algNamesOffset;
- count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
- offset+=4;
- for(i=0; i<count; ++i) {
- inRange=(const AlgorithmicRange *)(inBytes+offset);
- offset+=ds->readUInt16(inRange->size);
- }
- } else {
- /* swap data */
- const uint16_t *p;
- uint16_t *q, *temp;
- int16_t tokens[512];
- uint16_t tokenCount;
- uint8_t map[256], trailMap[256];
- /* copy the data for inaccessible bytes */
- if(inBytes!=outBytes) {
- uprv_memcpy(outBytes, inBytes, length);
- }
- /* the initial 4 offsets first */
- tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
- groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
- groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
- ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);
- /*
- * now the tokens table
- * it needs to be permutated along with the compressed name strings
- */
- p=(const uint16_t *)(inBytes+16);
- q=(uint16_t *)(outBytes+16);
- /* read and swap the tokenCount */
- tokenCount=ds->readUInt16(*p);
- ds->swapArray16(ds, p, 2, q, pErrorCode);
- ++p;
- ++q;
- /* read the first 512 tokens and make the token maps */
- if(tokenCount<=512) {
- count=tokenCount;
- } else {
- count=512;
- }
- for(i=0; i<count; ++i) {
- tokens[i]=udata_readInt16(ds, p[i]);
- }
- for(; i<512; ++i) {
- tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
- }
- makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
- makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);
- if(U_FAILURE(*pErrorCode)) {
- return 0;
- }
- /*
- * swap and permutate the tokens
- * go through a temporary array to support in-place swapping
- */
- temp=(uint16_t *)uprv_malloc(tokenCount*2);
- if(temp==nullptr) {
- udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
- tokenCount);
- *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
- return 0;
- }
- /* swap and permutate single-/lead-byte tokens */
- for(i=0; i<tokenCount && i<256; ++i) {
- ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
- }
- /* swap and permutate trail-byte tokens */
- for(; i<tokenCount; ++i) {
- ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);
- }
- /* copy the result into the output and free the temporary array */
- uprv_memcpy(q, temp, tokenCount*2);
- uprv_free(temp);
- /*
- * swap the token strings but not a possible padding byte after
- * the terminating NUL of the last string
- */
- udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
- outBytes+tokenStringOffset, pErrorCode);
- if(U_FAILURE(*pErrorCode)) {
- udata_printError(ds, "uchar_swapNames(token strings) failed\n");
- return 0;
- }
- /* swap the group table */
- count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
- ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2),
- outBytes+groupsOffset, pErrorCode);
- /*
- * swap the group strings
- * swap the string bytes but not the nibble-encoded string lengths
- */
- if(ds->inCharset!=ds->outCharset) {
- uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];
- const uint8_t *inStrings, *nextInStrings;
- uint8_t *outStrings;
- uint8_t c;
- inStrings=inBytes+groupStringOffset;
- outStrings=outBytes+groupStringOffset;
- stringsCount=algNamesOffset-groupStringOffset;
- /* iterate through string groups until only a few padding bytes are left */
- while(stringsCount>32) {
- nextInStrings=expandGroupLengths(inStrings, offsets, lengths);
- /* move past the length bytes */
- stringsCount-=(uint32_t)(nextInStrings-inStrings);
- outStrings+=nextInStrings-inStrings;
- inStrings=nextInStrings;
- count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
- stringsCount-=count;
- /* swap the string bytes using map[] and trailMap[] */
- while(count>0) {
- c=*inStrings++;
- *outStrings++=map[c];
- if(tokens[c]!=-2) {
- --count;
- } else {
- /* token lead byte: swap the trail byte, too */
- *outStrings++=trailMap[*inStrings++];
- count-=2;
- }
- }
- }
- }
- /* swap the algorithmic ranges */
- offset=algNamesOffset;
- count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
- ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
- offset+=4;
- for(i=0; i<count; ++i) {
- if(offset>(uint32_t)length) {
- udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
- length, i);
- *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
- return 0;
- }
- inRange=(const AlgorithmicRange *)(inBytes+offset);
- outRange=(AlgorithmicRange *)(outBytes+offset);
- offset+=ds->readUInt16(inRange->size);
- ds->swapArray32(ds, inRange, 8, outRange, pErrorCode);
- ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
- switch(inRange->type) {
- case 0:
- /* swap prefix string */
- ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),
- outRange+1, pErrorCode);
- if(U_FAILURE(*pErrorCode)) {
- udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
- i);
- return 0;
- }
- break;
- case 1:
- {
- /* swap factors and the prefix and factor strings */
- uint32_t factorsCount;
- factorsCount=inRange->variant;
- p=(const uint16_t *)(inRange+1);
- q=(uint16_t *)(outRange+1);
- ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);
- /* swap the strings, up to the last terminating NUL */
- p+=factorsCount;
- q+=factorsCount;
- stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);
- while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
- --stringsCount;
- }
- ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
- }
- break;
- default:
- udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
- inRange->type, i);
- *pErrorCode=U_UNSUPPORTED_ERROR;
- return 0;
- }
- }
- }
- return headerSize+(int32_t)offset;
- }
- /*
- * Hey, Emacs, please set the following:
- *
- * Local Variables:
- * indent-tabs-mode: nil
- * End:
- *
- */
|