ucnvisci.cpp 71 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 2000-2016, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. **********************************************************************
  8. * file name: ucnvisci.c
  9. * encoding: UTF-8
  10. * tab size: 8 (not used)
  11. * indentation:4
  12. *
  13. * created on: 2001JUN26
  14. * created by: Ram Viswanadha
  15. *
  16. * Date Name Description
  17. * 24/7/2001 Ram Added support for EXT character handling
  18. */
  19. #include "unicode/utypes.h"
  20. #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
  21. #include "unicode/ucnv.h"
  22. #include "unicode/ucnv_cb.h"
  23. #include "unicode/utf16.h"
  24. #include "cmemory.h"
  25. #include "ucnv_bld.h"
  26. #include "ucnv_cnv.h"
  27. #include "cstring.h"
  28. #include "uassert.h"
  /* Low four bits of the converter open options select the ISCII version. */
  #define UCNV_OPTIONS_VERSION_MASK 0xf

  /* Unicode code points that receive special handling. */
  #define NUKTA 0x093c
  #define HALANT 0x094d
  #define ZWNJ 0x200c /* Zero Width Non Joiner */
  #define ZWJ 0x200d /* Zero width Joiner */
  #define INVALID_CHAR 0xffff
  #define DANDA 0x0964
  #define DOUBLE_DANDA 0x0965

  /* ISCII byte values. */
  #define ATR 0xEF /* Attribute code */
  #define EXT 0xF0 /* Extension code */
  #define ISCII_NUKTA 0xE9
  #define ISCII_HALANT 0xE8
  #define ISCII_DANDA 0xEA
  #define ISCII_INV 0xD9
  #define ISCII_VOWEL_SIGN_E 0xE0

  /* Bounds of the Unicode range handled by this converter. */
  #define INDIC_BLOCK_BEGIN 0x0900
  #define INDIC_BLOCK_END 0x0D7F
  #define INDIC_RANGE (INDIC_BLOCK_END - INDIC_BLOCK_BEGIN)

  #define VOCALLIC_RR 0x0931
  #define LF 0x0A
  #define ASCII_END 0xA0
  #define NO_CHAR_MARKER 0xFFFE

  /*
   * DELTA and TELUGU are UniLang enumerators. The expansion is parenthesized so
   * that the product stays a single operand wherever the macro is used
   * (e.g. x / TELUGU_DELTA must not parse as (x / DELTA) * TELUGU).
   */
  #define TELUGU_DELTA (DELTA * TELUGU)

  #define DEV_ABBR_SIGN 0x0970
  #define DEV_ANUDATTA 0x0952
  #define EXT_RANGE_BEGIN 0xA1
  #define EXT_RANGE_END 0xEE

  /* Gurmukhi (PNJ) specific values. */
  #define PNJ_DELTA 0x0100
  #define PNJ_BINDI 0x0A02
  #define PNJ_TIPPI 0x0A70
  #define PNJ_SIGN_VIRAMA 0x0A4D
  #define PNJ_ADHAK 0x0A71
  #define PNJ_HA 0x0A39
  #define PNJ_RRA 0x0A5C
  /*
   * Script identifiers. _ISCIIOpen multiplies a script's value by DELTA to
   * obtain the delta it stores in the converter state.
   */
  typedef enum {
      DEVANAGARI = 0,
      BENGALI    = 1,
      GURMUKHI   = 2,
      GUJARATI   = 3,
      ORIYA      = 4,
      TAMIL      = 5,
      TELUGU     = 6,
      KANNADA    = 7,
      MALAYALAM  = 8,
      /* Not a script: the multiplier applied to the script values above. */
      DELTA      = 0x80
  } UniLang;
  /**
   * Code-page selectors: the byte that follows <ATR> in an ISCII stream to
   * switch code pages.
   */
  typedef enum {
      DEF = 0x40,
      RMN = 0x41,
      DEV = 0x42,  /* paired with DEVANAGARI in lookupInitialData */
      BNG = 0x43,  /* paired with BENGALI */
      TML = 0x44,  /* paired with TAMIL */
      TLG = 0x45,  /* paired with TELUGU */
      ASM = 0x46,
      ORI = 0x47,  /* paired with ORIYA */
      KND = 0x48,  /* paired with KANNADA */
      MLM = 0x49,  /* paired with MALAYALAM */
      GJR = 0x4A,  /* paired with GUJARATI */
      PNJ = 0x4B,  /* paired with GURMUKHI */

      ARB = 0x71,
      PES = 0x72,
      URD = 0x73,
      SND = 0x74,
      KSM = 0x75,
      PST = 0x76
  } ISCIILang;
  /*
   * One bit per script (or script pair) as used by the validity table.
   * Bits are listed from most to least significant.
   */
  typedef enum {
      ZERO     = 0,
      DEV_MASK = 1 << 7,
      PNJ_MASK = 1 << 6,
      GJR_MASK = 1 << 5,
      ORI_MASK = 1 << 4,
      BNG_MASK = 1 << 3,
      KND_MASK = 1 << 2,
      MLM_MASK = 1 << 1,
      TML_MASK = 1 << 0
  } MaskEnum;
  110. #define ISCII_CNV_PREFIX "ISCII,version="
  111. typedef struct {
  112. char16_t contextCharToUnicode; /* previous Unicode codepoint for contextual analysis */
  113. char16_t contextCharFromUnicode; /* previous Unicode codepoint for contextual analysis */
  114. uint16_t defDeltaToUnicode; /* delta for switching to default state when DEF is encountered */
  115. uint16_t currentDeltaFromUnicode; /* current delta in Indic block */
  116. uint16_t currentDeltaToUnicode; /* current delta in Indic block */
  117. MaskEnum currentMaskFromUnicode; /* mask for current state in toUnicode */
  118. MaskEnum currentMaskToUnicode; /* mask for current state in toUnicode */
  119. MaskEnum defMaskToUnicode; /* mask for default state in toUnicode */
  120. UBool isFirstBuffer; /* boolean for fromUnicode to see if we need to announce the first script */
  121. UBool resetToDefaultToUnicode; /* boolean for resetting to default delta and mask when a newline is encountered*/
  122. char name[sizeof(ISCII_CNV_PREFIX) + 1];
  123. UChar32 prevToUnicodeStatus; /* Hold the previous toUnicodeStatus. This is necessary because we may need to know the last two code points. */
  124. } UConverterDataISCII;
  125. typedef struct LookupDataStruct {
  126. UniLang uniLang;
  127. MaskEnum maskEnum;
  128. ISCIILang isciiLang;
  129. } LookupDataStruct;
  130. static const LookupDataStruct lookupInitialData[]={
  131. { DEVANAGARI, DEV_MASK, DEV },
  132. { BENGALI, BNG_MASK, BNG },
  133. { GURMUKHI, PNJ_MASK, PNJ },
  134. { GUJARATI, GJR_MASK, GJR },
  135. { ORIYA, ORI_MASK, ORI },
  136. { TAMIL, TML_MASK, TML },
  137. { TELUGU, KND_MASK, TLG },
  138. { KANNADA, KND_MASK, KND },
  139. { MALAYALAM, MLM_MASK, MLM }
  140. };
  /*
   * Flags for the special handling of certain Gurmukhi characters, indexed by
   * (code point - 0x0A00) for U+0A00..U+0A4F.
   *   value & 1 : PNJ consonant
   *   value & 2 : PNJ Bindi Tippi
   */
  static const uint8_t pnjMap[80] = {
      /* 0A00 */ 0, 0, 0, 0, 0, 2, 0, 2,
      /* 0A08 */ 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0A10 */ 0, 0, 0, 0, 0, 3, 3, 3,
      /* 0A18 */ 3, 3, 3, 3, 3, 3, 3, 3,
      /* 0A20 */ 3, 3, 3, 3, 3, 3, 3, 3,
      /* 0A28 */ 3, 0, 3, 3, 3, 3, 3, 3,
      /* 0A30 */ 3, 0, 0, 0, 0, 3, 3, 0,
      /* 0A38 */ 3, 3, 0, 0, 0, 0, 0, 2,
      /* 0A40 */ 0, 2, 2, 0, 0, 0, 0, 0,
      /* 0A48 */ 0, 0, 0, 0, 0, 0, 0, 0
  };
  158. static UBool
  159. isPNJConsonant(UChar32 c) {
  160. if (c < 0xa00 || 0xa50 <= c) {
  161. return false;
  162. } else {
  163. return pnjMap[c - 0xa00] & 1;
  164. }
  165. }
  166. static UBool
  167. isPNJBindiTippi(UChar32 c) {
  168. if (c < 0xa00 || 0xa50 <= c) {
  169. return false;
  170. } else {
  171. return pnjMap[c - 0xa00] >> 1;
  172. }
  173. }
  174. U_CDECL_BEGIN
  175. static void U_CALLCONV
  176. _ISCIIOpen(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode) {
  177. if(pArgs->onlyTestIsLoadable) {
  178. return;
  179. }
  180. cnv->extraInfo = uprv_malloc(sizeof(UConverterDataISCII));
  181. if (cnv->extraInfo != nullptr) {
  182. int32_t len=0;
  183. UConverterDataISCII *converterData=
  184. (UConverterDataISCII *) cnv->extraInfo;
  185. converterData->contextCharToUnicode=NO_CHAR_MARKER;
  186. cnv->toUnicodeStatus = missingCharMarker;
  187. converterData->contextCharFromUnicode=0x0000;
  188. converterData->resetToDefaultToUnicode=false;
  189. /* check if the version requested is supported */
  190. if ((pArgs->options & UCNV_OPTIONS_VERSION_MASK) < 9) {
  191. /* initialize state variables */
  192. converterData->currentDeltaFromUnicode
  193. = converterData->currentDeltaToUnicode
  194. = converterData->defDeltaToUnicode = (uint16_t)(lookupInitialData[pArgs->options & UCNV_OPTIONS_VERSION_MASK].uniLang * DELTA);
  195. converterData->currentMaskFromUnicode
  196. = converterData->currentMaskToUnicode
  197. = converterData->defMaskToUnicode = lookupInitialData[pArgs->options & UCNV_OPTIONS_VERSION_MASK].maskEnum;
  198. converterData->isFirstBuffer=true;
  199. (void)uprv_strcpy(converterData->name, ISCII_CNV_PREFIX);
  200. len = (int32_t)uprv_strlen(converterData->name);
  201. converterData->name[len]= (char)((pArgs->options & UCNV_OPTIONS_VERSION_MASK) + '0');
  202. converterData->name[len+1]=0;
  203. converterData->prevToUnicodeStatus = 0x0000;
  204. } else {
  205. uprv_free(cnv->extraInfo);
  206. cnv->extraInfo = nullptr;
  207. *errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  208. }
  209. } else {
  210. *errorCode =U_MEMORY_ALLOCATION_ERROR;
  211. }
  212. }
  213. static void U_CALLCONV
  214. _ISCIIClose(UConverter *cnv) {
  215. if (cnv->extraInfo!=nullptr) {
  216. if (!cnv->isExtraLocal) {
  217. uprv_free(cnv->extraInfo);
  218. }
  219. cnv->extraInfo=nullptr;
  220. }
  221. }
  222. static const char* U_CALLCONV
  223. _ISCIIgetName(const UConverter* cnv) {
  224. if (cnv->extraInfo) {
  225. UConverterDataISCII* myData= (UConverterDataISCII*)cnv->extraInfo;
  226. return myData->name;
  227. }
  228. return nullptr;
  229. }
  230. static void U_CALLCONV
  231. _ISCIIReset(UConverter *cnv, UConverterResetChoice choice) {
  232. UConverterDataISCII* data =(UConverterDataISCII *) (cnv->extraInfo);
  233. if (choice<=UCNV_RESET_TO_UNICODE) {
  234. cnv->toUnicodeStatus = missingCharMarker;
  235. cnv->mode=0;
  236. data->currentDeltaToUnicode=data->defDeltaToUnicode;
  237. data->currentMaskToUnicode = data->defMaskToUnicode;
  238. data->contextCharToUnicode=NO_CHAR_MARKER;
  239. data->prevToUnicodeStatus = 0x0000;
  240. }
  241. if (choice!=UCNV_RESET_TO_UNICODE) {
  242. cnv->fromUChar32=0x0000;
  243. data->contextCharFromUnicode=0x00;
  244. data->currentMaskFromUnicode=data->defMaskToUnicode;
  245. data->currentDeltaFromUnicode=data->defDeltaToUnicode;
  246. data->isFirstBuffer=true;
  247. data->resetToDefaultToUnicode=false;
  248. }
  249. }
  250. /**
  251. * The values in validity table are indexed by the lower bits of Unicode
  252. * range 0x0900 - 0x097f (128 entries). The values have a structure like:
  253. * ---------------------------------------------------------------
  254. * | DEV | PNJ | GJR | ORI | BNG | TLG | MLM | TML |
  255. * | | | | | ASM | KND | | |
  256. * ---------------------------------------------------------------
  257. * If a code point is valid in a particular script
  258. * then that bit is turned on
  259. *
  260. * Unicode does not distinguish between Bengali and Assamese so we use 1 bit
  261. * to represent these languages.
  262. *
  263. * Telugu and Kannada have same codepoints except for Vocallic_RR which we special case
  264. * and combine and use 1 bit to represent these languages.
  265. *
  266. * TODO: It is probably easier to understand and maintain to change this
  267. * to use uint16_t and give each of the 9 Unicode/script blocks its own bit.
  268. */
  /*
   * Script-validity bitmap, one byte per entry. Each entry is the sum of the
   * MaskEnum bits of the scripts in which the code point is valid; the ZERO
   * terms are placeholders that keep the eight per-script columns aligned.
   * Only the entries for 0x900..0x970 plus one zero entry are written out; the
   * remaining elements of the 128-element array are zero-initialized.
   *
   * Each row's leading comment reads "ISCII byte : validity value : Unicode".
   * NOTE(review): some of the hex validity values in those comments disagree
   * with the mask expression on the same row (e.g. the rows for 0x901 and
   * 0x92e); the expression is what is compiled.
   */
  269. static const uint8_t validityTable[128] = {
  270. /* This state table is tool generated please do not edit unless you know exactly what you are doing */
  271. /* Note: This table was edited to mirror the Windows XP implementation */
  272. /*ISCII:Valid:Unicode */
  273. /*0xa0 : 0x00: 0x900 */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
  274. /*0xa1 : 0xb8: 0x901 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO ,
  275. /*0xa2 : 0xfe: 0x902 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  276. /*0xa3 : 0xbf: 0x903 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  277. /*0x00 : 0x00: 0x904 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
  278. /*0xa4 : 0xff: 0x905 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  279. /*0xa5 : 0xff: 0x906 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  280. /*0xa6 : 0xff: 0x907 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  281. /*0xa7 : 0xff: 0x908 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  282. /*0xa8 : 0xff: 0x909 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  283. /*0xa9 : 0xff: 0x90a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  284. /*0xaa : 0xfe: 0x90b */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
  285. /*0x00 : 0x00: 0x90c */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
  286. /*0xae : 0x80: 0x90d */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO ,
  287. /*0xab : 0x87: 0x90e */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK ,
  288. /*0xac : 0xff: 0x90f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  289. /*0xad : 0xff: 0x910 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  290. /*0xb2 : 0x80: 0x911 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO ,
  291. /*0xaf : 0x87: 0x912 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK ,
  292. /*0xb0 : 0xff: 0x913 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  293. /*0xb1 : 0xff: 0x914 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  294. /*0xb3 : 0xff: 0x915 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  295. /*0xb4 : 0xfe: 0x916 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
  296. /*0xb5 : 0xfe: 0x917 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
  297. /*0xb6 : 0xfe: 0x918 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
  298. /*0xb7 : 0xff: 0x919 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  299. /*0xb8 : 0xff: 0x91a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  300. /*0xb9 : 0xfe: 0x91b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
  301. /*0xba : 0xff: 0x91c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  302. /*0xbb : 0xfe: 0x91d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
  303. /*0xbc : 0xff: 0x91e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  304. /*0xbd : 0xff: 0x91f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  305. /*0xbe : 0xfe: 0x920 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
  306. /*0xbf : 0xfe: 0x921 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
  307. /*0xc0 : 0xfe: 0x922 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
  308. /*0xc1 : 0xff: 0x923 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  309. /*0xc2 : 0xff: 0x924 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  310. /*0xc3 : 0xfe: 0x925 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
  311. /*0xc4 : 0xfe: 0x926 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
  312. /*0xc5 : 0xfe: 0x927 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
  313. /*0xc6 : 0xff: 0x928 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  314. /*0xc7 : 0x81: 0x929 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + TML_MASK ,
  315. /*0xc8 : 0xff: 0x92a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  316. /*0xc9 : 0xfe: 0x92b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
  317. /*0xca : 0xfe: 0x92c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
  318. /*0xcb : 0xfe: 0x92d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
  319. /*0xcc : 0xfe: 0x92e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  320. /*0xcd : 0xff: 0x92f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  321. /*0xcf : 0xff: 0x930 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  322. /*0xd0 : 0x87: 0x931 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + MLM_MASK + TML_MASK ,
  323. /*0xd1 : 0xff: 0x932 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  324. /*0xd2 : 0xb7: 0x933 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + ZERO + KND_MASK + MLM_MASK + TML_MASK ,
  325. /*0xd3 : 0x83: 0x934 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + MLM_MASK + TML_MASK ,
  326. /*0xd4 : 0xff: 0x935 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + ZERO + KND_MASK + MLM_MASK + TML_MASK ,
  327. /*0xd5 : 0xfe: 0x936 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
  328. /*0xd6 : 0xbf: 0x937 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  329. /*0xd7 : 0xff: 0x938 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  330. /*0xd8 : 0xff: 0x939 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  331. /*0x00 : 0x00: 0x93A */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
  332. /*0x00 : 0x00: 0x93B */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
  333. /*0xe9 : 0xda: 0x93c */ DEV_MASK + PNJ_MASK + ZERO + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO ,
  334. /*0x00 : 0x00: 0x93d */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
  335. /*0xda : 0xff: 0x93e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  336. /*0xdb : 0xff: 0x93f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  337. /*0xdc : 0xff: 0x940 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  338. /*0xdd : 0xff: 0x941 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  339. /*0xde : 0xff: 0x942 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  340. /*0xdf : 0xbe: 0x943 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
  341. /*0x00 : 0x00: 0x944 */ DEV_MASK + ZERO + GJR_MASK + ZERO + BNG_MASK + KND_MASK + ZERO + ZERO ,
  342. /*0xe3 : 0x80: 0x945 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO ,
  343. /*0xe0 : 0x87: 0x946 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK ,
  344. /*0xe1 : 0xff: 0x947 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  345. /*0xe2 : 0xff: 0x948 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  346. /*0xe7 : 0x80: 0x949 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO ,
  347. /*0xe4 : 0x87: 0x94a */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK ,
  348. /*0xe5 : 0xff: 0x94b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  349. /*0xe6 : 0xff: 0x94c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  350. /*0xe8 : 0xff: 0x94d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  351. /*0xec : 0x00: 0x94e */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
  352. /*0xed : 0x00: 0x94f */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
  353. /*0x00 : 0x00: 0x950 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO ,
  354. /*0x00 : 0x00: 0x951 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
  355. /*0x00 : 0x00: 0x952 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
  356. /*0x00 : 0x00: 0x953 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
  357. /*0x00 : 0x00: 0x954 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
  358. /*0x00 : 0x00: 0x955 */ ZERO + ZERO + ZERO + ZERO + ZERO + KND_MASK + ZERO + ZERO ,
  359. /*0x00 : 0x00: 0x956 */ ZERO + ZERO + ZERO + ORI_MASK + ZERO + KND_MASK + ZERO + ZERO ,
  360. /*0x00 : 0x00: 0x957 */ ZERO + ZERO + ZERO + ORI_MASK + BNG_MASK + ZERO + MLM_MASK + ZERO ,
  361. /*0x00 : 0x00: 0x958 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
  362. /*0x00 : 0x00: 0x959 */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
  363. /*0x00 : 0x00: 0x95a */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
  364. /*0x00 : 0x00: 0x95b */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
  365. /*0x00 : 0x00: 0x95c */ DEV_MASK + PNJ_MASK + ZERO + ZERO + BNG_MASK + ZERO + ZERO + ZERO ,
  366. /*0x00 : 0x00: 0x95d */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO ,
  367. /*0x00 : 0x00: 0x95e */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
  368. /*0xce : 0x98: 0x95f */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO ,
  369. /*0x00 : 0x00: 0x960 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
  370. /*0x00 : 0x00: 0x961 */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
  371. /*0x00 : 0x00: 0x962 */ DEV_MASK + ZERO + ZERO + ZERO + BNG_MASK + ZERO + ZERO + ZERO ,
  372. /*0x00 : 0x00: 0x963 */ DEV_MASK + ZERO + ZERO + ZERO + BNG_MASK + ZERO + ZERO + ZERO ,
  373. /*0xea : 0xf8: 0x964 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
  374. /*0xeaea : 0x00: 0x965*/ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
  375. /*0xf1 : 0xff: 0x966 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  376. /*0xf2 : 0xff: 0x967 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  377. /*0xf3 : 0xff: 0x968 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  378. /*0xf4 : 0xff: 0x969 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  379. /*0xf5 : 0xff: 0x96a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  380. /*0xf6 : 0xff: 0x96b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  381. /*0xf7 : 0xff: 0x96c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  382. /*0xf8 : 0xff: 0x96d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  383. /*0xf9 : 0xff: 0x96e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  384. /*0xfa : 0xff: 0x96f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
  385. /*0x00 : 0x80: 0x970 */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
  386. /*
  387. * The length of the array is 128 to provide values for 0x900..0x97f.
  388. * The last 15 entries for 0x971..0x97f of the validity table are all zero
  389. * because no Indic script uses such Unicode code points.
  390. */
  391. /*0x00 : 0x00: 0x9yz */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO
  392. };
  /*
   * Unicode -> ISCII mapping for 0x0900..0x097f, indexed by (code point - 0x0900).
   * Each row holds eight consecutive code points starting at the value in the
   * row comment. 0xffff is the same value as INVALID_CHAR.
   * NOTE(review): entries above 0x00ff presumably pack two ISCII bytes with the
   * first byte in the high 8 bits (e.g. 0xb3e9 ends in ISCII_NUKTA, 0xf0b8
   * starts with EXT) - confirm against the fromUnicode code that reads this table.
   */
  static const uint16_t fromUnicodeTable[128]={
      /* 0x0900 */ 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0xa4e0, 0x00a4, 0x00a5, 0x00a6,
      /* 0x0908 */ 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0xa6e9, 0x00ae, 0x00ab, 0x00ac,
      /* 0x0910 */ 0x00ad, 0x00b2, 0x00af, 0x00b0, 0x00b1, 0x00b3, 0x00b4, 0x00b5,
      /* 0x0918 */ 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd,
      /* 0x0920 */ 0x00be, 0x00bf, 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5,
      /* 0x0928 */ 0x00c6, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd,
      /* 0x0930 */ 0x00cf, 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6,
      /* 0x0938 */ 0x00d7, 0x00d8, 0xffff, 0xffff, 0x00e9, 0xeae9, 0x00da, 0x00db,
      /* 0x0940 */ 0x00dc, 0x00dd, 0x00de, 0x00df, 0xdfe9, 0x00e3, 0x00e0, 0x00e1,
      /* 0x0948 */ 0x00e2, 0x00e7, 0x00e4, 0x00e5, 0x00e6, 0x00e8, 0x00ec, 0x00ed,
      /* 0x0950 */ 0xa1e9 /* OM Symbol */, 0xffff, 0xf0b8, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
      /* 0x0958 */ 0xb3e9, 0xb4e9, 0xb5e9, 0xbae9, 0xbfe9, 0xc0e9, 0xc9e9, 0x00ce,
      /* 0x0960 */ 0xaae9, 0xa7e9, 0xdbe9, 0xdce9, 0x00ea, 0xeaea, 0x00f1, 0x00f2,
      /* 0x0968 */ 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x00f9, 0x00fa,
      /* 0x0970 */ 0xf0bf, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
      /* 0x0978 */ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
  };
  /*
   * ISCII byte -> UTF-16 code unit, indexed directly by the byte value.
   *
   *  - 0x00..0xa0 map to the identical code point.
   *  - 0xa1..0xfa hold values from the Devanagari block (U+09xx); 0xd9 is the
   *    one exception and holds U+200D.  WRITE_TO_TARGET_TO_U later adds the
   *    current script delta to move the value into the active Indic block.
   *  - 0xffff marks a byte that has no entry of its own (presumably the same
   *    value as missingCharMarker -- confirm against its definition).
   *
   * Eight entries per row; the comment at the start of a row is the index of
   * its first entry.
   */
  static const uint16_t toUnicodeTable[256] = {
      /* 0x00 */ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
      /* 0x08 */ 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
      /* 0x10 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
      /* 0x18 */ 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
      /* 0x20 */ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
      /* 0x28 */ 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
      /* 0x30 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
      /* 0x38 */ 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
      /* 0x40 */ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
      /* 0x48 */ 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
      /* 0x50 */ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
      /* 0x58 */ 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
      /* 0x60 */ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
      /* 0x68 */ 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
      /* 0x70 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
      /* 0x78 */ 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f,
      /* 0x80 */ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
      /* 0x88 */ 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
      /* 0x90 */ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
      /* 0x98 */ 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
      /* 0xa0 */ 0x00a0, 0x0901, 0x0902, 0x0903, 0x0905, 0x0906, 0x0907, 0x0908,
      /* 0xa8 */ 0x0909, 0x090a, 0x090b, 0x090e, 0x090f, 0x0910, 0x090d, 0x0912,
      /* 0xb0 */ 0x0913, 0x0914, 0x0911, 0x0915, 0x0916, 0x0917, 0x0918, 0x0919,
      /* 0xb8 */ 0x091a, 0x091b, 0x091c, 0x091d, 0x091e, 0x091f, 0x0920, 0x0921,
      /* 0xc0 */ 0x0922, 0x0923, 0x0924, 0x0925, 0x0926, 0x0927, 0x0928, 0x0929,
      /* 0xc8 */ 0x092a, 0x092b, 0x092c, 0x092d, 0x092e, 0x092f, 0x095f, 0x0930,
      /* 0xd0 */ 0x0931, 0x0932, 0x0933, 0x0934, 0x0935, 0x0936, 0x0937, 0x0938,
      /* 0xd8 */ 0x0939, 0x200d, 0x093e, 0x093f, 0x0940, 0x0941, 0x0942, 0x0943,
      /* 0xe0 */ 0x0946, 0x0947, 0x0948, 0x0945, 0x094a, 0x094b, 0x094c, 0x0949,
      /* 0xe8 */ 0x094d, 0x093c, 0x0964, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
      /* 0xf0 */ 0xffff, 0x0966, 0x0967, 0x0968, 0x0969, 0x096a, 0x096b, 0x096c,
      /* 0xf8 */ 0x096d, 0x096e, 0x096f, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff
  };
  /*
   * Pairs consulted when ISCII_VOWEL_SIGN_E follows another ISCII byte.
   *
   * Row 0 is a header: its first column is the total number of rows (header
   * included) and bounds the search loop; its second column is not read.
   * Every other row is { preceding ISCII byte, replacement code unit }.
   */
  static const uint16_t vowelSignESpecialCases[][2] = {
      { 0x0002, 0x0000 },   /* header: row count, unused */
      { 0x00a4, 0x0904 }
  };
  /*
   * Pairs consulted when ISCII_NUKTA follows another ISCII byte.
   *
   * Row 0 is a header: its first column is the total number of rows (header
   * included) and bounds the search loop; its second column is not read.
   * Every other row is { preceding ISCII byte, replacement code unit }.
   */
  static const uint16_t nuktaSpecialCases[][2] = {
      { 0x0010, 0x0000 },   /* header: 16 rows, unused */
      { 0x00a6, 0x090c },
      { 0x00ea, 0x093d },
      { 0x00df, 0x0944 },
      { 0x00a1, 0x0950 },
      { 0x00b3, 0x0958 },
      { 0x00b4, 0x0959 },
      { 0x00b5, 0x095a },
      { 0x00ba, 0x095b },
      { 0x00bf, 0x095c },
      { 0x00c0, 0x095d },
      { 0x00c9, 0x095e },
      { 0x00aa, 0x0960 },
      { 0x00a7, 0x0961 },
      { 0x00db, 0x0962 },
      { 0x00dc, 0x0963 }
  };
  /*
   * WRITE_TO_TARGET_FROM_U: emit the ISCII bytes packed in targetByteUnit.
   *
   * targetByteUnit holds one byte (value <= 0xFF), two bytes (<= 0xFFFF) or
   * three bytes (> 0xFFFF), most significant byte first.
   *
   *   args           pointer whose ->source and ->converter members are read
   *   offsets        int32_t cursor, may be null; one entry is stored per byte
   *                  written to target
   *   source         current source cursor (already past the code unit being
   *                  converted); the recorded offset is source - args->source - 1
   *   target         output cursor, advanced for every byte written
   *   targetLimit    end of the output buffer
   *   targetByteUnit the packed byte sequence
   *   err            pointer to the status; set to U_BUFFER_OVERFLOW_ERROR when
   *                  not every byte fits
   *
   * Bytes that do not fit are appended to args->converter->charErrorBuffer.
   * For a three-byte unit the recorded offset is decremented by one before it
   * is stored.
   *
   * Every parameter is parenthesized in the expansion so that expression
   * arguments (e.g. `hi | lo`, `&argsStruct`) expand with the intended
   * precedence.  Arguments are still evaluated more than once, so they must
   * not have side effects; target and offsets must be modifiable lvalues.
   */
  #define WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err) UPRV_BLOCK_MACRO_BEGIN { \
      int32_t offset = (int32_t)((source) - (args)->source - 1); \
      /* write the targetUniChar to target */ \
      if ((target) < (targetLimit)) { \
          if ((targetByteUnit) <= 0xFF) { \
              /* single byte */ \
              *(target)++ = (uint8_t)(targetByteUnit); \
              if (offsets) { \
                  *((offsets)++) = offset; \
              } \
          } else { \
              if ((targetByteUnit) > 0xFFFF) { \
                  /* leading byte of a three-byte unit */ \
                  *(target)++ = (uint8_t)((targetByteUnit) >> 16); \
                  if (offsets) { \
                      --offset; \
                      *((offsets)++) = offset; \
                  } \
              } \
              if (!((target) < (targetLimit))) { \
                  /* no room left for the remaining two bytes */ \
                  (args)->converter->charErrorBuffer[(args)->converter->charErrorBufferLength++] = \
                      (uint8_t)((targetByteUnit) >> 8); \
                  (args)->converter->charErrorBuffer[(args)->converter->charErrorBufferLength++] = \
                      (uint8_t)(targetByteUnit); \
                  *(err) = U_BUFFER_OVERFLOW_ERROR; \
              } else { \
                  *(target)++ = (uint8_t)((targetByteUnit) >> 8); \
                  if (offsets) { \
                      *((offsets)++) = offset; \
                  } \
                  if ((target) < (targetLimit)) { \
                      *(target)++ = (uint8_t)(targetByteUnit); \
                      if (offsets) { \
                          *((offsets)++) = offset; \
                      } \
                  } else { \
                      /* only the last byte overflows */ \
                      (args)->converter->charErrorBuffer[(args)->converter->charErrorBufferLength++] = \
                          (uint8_t)(targetByteUnit); \
                      *(err) = U_BUFFER_OVERFLOW_ERROR; \
                  } \
              } \
          } \
      } else { \
          /* target already full: buffer every byte of the unit */ \
          if ((targetByteUnit) & 0xFF0000) { \
              (args)->converter->charErrorBuffer[(args)->converter->charErrorBufferLength++] = \
                  (uint8_t)((targetByteUnit) >> 16); \
          } \
          if ((targetByteUnit) & 0xFF00) { \
              (args)->converter->charErrorBuffer[(args)->converter->charErrorBufferLength++] = \
                  (uint8_t)((targetByteUnit) >> 8); \
          } \
          (args)->converter->charErrorBuffer[(args)->converter->charErrorBufferLength++] = \
              (uint8_t)(targetByteUnit); \
          *(err) = U_BUFFER_OVERFLOW_ERROR; \
      } \
  } UPRV_BLOCK_MACRO_END
  857. /* Rules:
  858. * Explicit Halant :
  859. * <HALANT> + <ZWNJ>
  860. * Soft Halant :
  861. * <HALANT> + <ZWJ>
  862. */
  /*
   * Converts the UTF-16 text in [args->source, args->sourceLimit) to ISCII bytes
   * written through args->target; source offsets are recorded when
   * args->offsets is non-null (see WRITE_TO_TARGET_FROM_U).
   *
   * Outline, as implemented below:
   *  - A null converter, or a limit that lies before its start pointer, is
   *    rejected with U_ILLEGAL_ARGUMENT_ERROR and nothing else is touched.
   *  - If args->converter->fromUChar32 is non-zero on entry, control jumps
   *    straight to the getTrail label to resume that pending code unit.
   *  - When fromUnicodeStatus holds LF, ATR plus the language code of the
   *    current block is written before the next code unit is read.
   *  - Code units <= ASCII_END are written unchanged and remembered in
   *    fromUnicodeStatus.
   *  - ZWNJ and ZWJ are mapped according to contextCharFromUnicode.
   *  - Code units in the Indic range are rebased by their block delta and
   *    looked up in fromUnicodeTable; ATR plus language code is written first
   *    whenever the block differs from the previous one.
   *  - A code unit without a mapping, or a surrogate, leaves the loop with the
   *    code point stored in fromUChar32 and *err set as coded in each branch.
   *
   * args->source and args->target are written back before returning.
   */
  863. static void U_CALLCONV
  864. UConverter_fromUnicode_ISCII_OFFSETS_LOGIC(
  865. UConverterFromUnicodeArgs * args, UErrorCode * err) {
  866. const char16_t *source = args->source;
  867. const char16_t *sourceLimit = args->sourceLimit;
  868. unsigned char *target = (unsigned char *) args->target;
  869. unsigned char *targetLimit = (unsigned char *) args->targetLimit;
  870. int32_t* offsets = args->offsets;
  871. uint32_t targetByteUnit = 0x0000;
  872. UChar32 sourceChar = 0x0000;
  873. UChar32 tempContextFromUnicode = 0x0000; /* For special handling of the Gurmukhi script. */
  874. UConverterDataISCII *converterData;
  875. uint16_t newDelta=0;
  876. uint16_t range = 0;
  877. UBool deltaChanged = false;
  878. if ((args->converter == nullptr) || (args->targetLimit < args->target) || (args->sourceLimit < args->source)) {
  879. *err = U_ILLEGAL_ARGUMENT_ERROR;
  880. return;
  881. }
  882. /* initialize data */
  883. converterData=(UConverterDataISCII*)args->converter->extraInfo;
  884. newDelta=converterData->currentDeltaFromUnicode;
  885. range = (uint16_t)(newDelta/DELTA);
       /* A code unit left in fromUChar32 by an earlier call is resumed at getTrail,
        * which sits inside the unassigned/surrogate branch of the loop below. */
  886. if ((sourceChar = args->converter->fromUChar32)!=0) {
  887. goto getTrail;
  888. }
  889. /*writing the char to the output stream */
  890. while (source < sourceLimit) {
  891. /* Write the language code following LF only if LF is not the last character. */
  892. if (args->converter->fromUnicodeStatus == LF) {
  893. targetByteUnit = ATR<<8;
  894. targetByteUnit += (uint8_t) lookupInitialData[range].isciiLang;
  895. args->converter->fromUnicodeStatus = 0x0000;
  896. /* now append ATR and language code */
  897. WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err);
  898. if (U_FAILURE(*err)) {
  899. break;
  900. }
  901. }
  902. sourceChar = *source++;
       /* Snapshot the context before the switch below clears or replaces it;
        * the Adhak + consonant check after the switch needs the old value. */
  903. tempContextFromUnicode = converterData->contextCharFromUnicode;
  904. targetByteUnit = missingCharMarker;
  905. /*check if input is in ASCII and C0 control codes range*/
  906. if (sourceChar <= ASCII_END) {
  907. args->converter->fromUnicodeStatus = sourceChar;
  908. WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,sourceChar,err);
  909. if (U_FAILURE(*err)) {
  910. break;
  911. }
  912. continue;
  913. }
  914. switch (sourceChar) {
  915. case ZWNJ:
  916. /* contextChar has HALANT */
  917. if (converterData->contextCharFromUnicode) {
  918. converterData->contextCharFromUnicode = 0x00;
  919. targetByteUnit = ISCII_HALANT;
  920. } else {
  921. /* consume ZWNJ and continue */
  922. converterData->contextCharFromUnicode = 0x00;
  923. continue;
  924. }
  925. break;
  926. case ZWJ:
  927. /* contextChar has HALANT */
  928. if (converterData->contextCharFromUnicode) {
  929. targetByteUnit = ISCII_NUKTA;
  930. } else {
  931. targetByteUnit =ISCII_INV;
  932. }
  933. converterData->contextCharFromUnicode = 0x00;
  934. break;
  935. default:
  936. /* is the sourceChar in the INDIC_RANGE? */
  937. if ((uint16_t)(INDIC_BLOCK_END-sourceChar) <= INDIC_RANGE) {
  938. /* Danda and Double Danda are valid in Northern scripts.. since Unicode
  939. * does not include these codepoints in all Northern scrips we need to
  940. * filter them out
  941. */
  942. if (sourceChar!= DANDA && sourceChar != DOUBLE_DANDA) {
  943. /* find out to which block the souceChar belongs*/
  944. range =(uint16_t)((sourceChar-INDIC_BLOCK_BEGIN)/DELTA);
  945. newDelta =(uint16_t)(range*DELTA);
  946. /* Now are we in the same block as the previous? */
  947. if (newDelta!= converterData->currentDeltaFromUnicode || converterData->isFirstBuffer) {
  948. converterData->currentDeltaFromUnicode = newDelta;
  949. converterData->currentMaskFromUnicode = lookupInitialData[range].maskEnum;
  950. deltaChanged =true;
  951. converterData->isFirstBuffer=false;
  952. }
  953. if (converterData->currentDeltaFromUnicode == PNJ_DELTA) {
  954. if (sourceChar == PNJ_TIPPI) {
  955. /* Make sure Tippi is converted to Bindi. */
  956. sourceChar = PNJ_BINDI;
  957. } else if (sourceChar == PNJ_ADHAK) {
  958. /* This is for consonant cluster handling. */
  959. converterData->contextCharFromUnicode = PNJ_ADHAK;
  960. }
  961. }
  962. /* Normalize all Indic codepoints to Devanagari and map them to ISCII */
  963. /* now subtract the new delta from sourceChar*/
  964. sourceChar -= converterData->currentDeltaFromUnicode;
  965. }
  966. /* get the target byte unit */
  967. targetByteUnit=fromUnicodeTable[(uint8_t)sourceChar];
  968. /* is the code point valid in current script? */
  969. if ((validityTable[(uint8_t)sourceChar] & converterData->currentMaskFromUnicode)==0) {
  970. /* Vocallic RR is assigned in ISCII Telugu and Unicode */
  971. if (converterData->currentDeltaFromUnicode!=(TELUGU_DELTA) || sourceChar!=VOCALLIC_RR) {
  972. targetByteUnit=missingCharMarker;
  973. }
  974. }
  975. if (deltaChanged) {
  976. /* we are in a script block which is different than
  977. * previous sourceChar's script block write ATR and language codes
  978. */
  979. uint32_t temp=0;
  980. temp =(uint16_t)(ATR<<8);
  981. temp += (uint16_t)((uint8_t) lookupInitialData[range].isciiLang);
  982. /* reset */
  983. deltaChanged=false;
  984. /* now append ATR and language code */
  985. WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,temp,err);
       /* NOTE(review): this break leaves the enclosing switch, not the while
        * loop. The Adhak `continue` and the context reset below are skipped and
        * execution carries on with the code after the switch while *err is
        * already a failure -- confirm this is intended. */
  986. if (U_FAILURE(*err)) {
  987. break;
  988. }
  989. }
       /* Adhak produces no output of its own; it stays in
        * contextCharFromUnicode (set above) for the next code unit. */
  990. if (converterData->currentDeltaFromUnicode == PNJ_DELTA && (sourceChar + PNJ_DELTA) == PNJ_ADHAK) {
  991. continue;
  992. }
  993. }
  994. /* reset context char */
  995. converterData->contextCharFromUnicode = 0x00;
  996. break;
  997. }
  998. if (converterData->currentDeltaFromUnicode == PNJ_DELTA && tempContextFromUnicode == PNJ_ADHAK && isPNJConsonant((sourceChar + PNJ_DELTA))) {
  999. /* If the previous codepoint is Adhak and the current codepoint is a consonant, the targetByteUnit should be C + Halant + C. */
  1000. /* reset context char */
  1001. converterData->contextCharFromUnicode = 0x0000;
       /* Packs three bytes into one unit: C in bits 16-23, ISCII_HALANT in bits
        * 8-15, C in bits 0-7 (presumably targetByteUnit is a single byte here --
        * TODO confirm against fromUnicodeTable). */
  1002. targetByteUnit = targetByteUnit << 16 | ISCII_HALANT << 8 | targetByteUnit;
  1003. /* write targetByteUnit to target */
  1004. WRITE_TO_TARGET_FROM_U(args, offsets, source, target, targetLimit, targetByteUnit,err);
  1005. if (U_FAILURE(*err)) {
  1006. break;
  1007. }
  1008. } else if (targetByteUnit != missingCharMarker) {
       /* Remember a halant so that a following ZWNJ/ZWJ can be mapped to the
        * explicit/soft halant forms handled in the switch above. */
  1009. if (targetByteUnit==ISCII_HALANT) {
  1010. converterData->contextCharFromUnicode = (char16_t)targetByteUnit;
  1011. }
  1012. /* write targetByteUnit to target*/
  1013. WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err);
  1014. if (U_FAILURE(*err)) {
  1015. break;
  1016. }
  1017. } else {
  1018. /* oops.. the code point is unassigned */
  1019. /*check if the char is a First surrogate*/
  1020. if (U16_IS_SURROGATE(sourceChar)) {
  1021. if (U16_IS_SURROGATE_LEAD(sourceChar)) {
       /* Also entered by the goto at the top of the function when a code unit
        * was pending in fromUChar32. */
  1022. getTrail:
  1023. /*look ahead to find the trail surrogate*/
  1024. if (source < sourceLimit) {
  1025. /* test the following code unit */
  1026. char16_t trail= (*source);
  1027. if (U16_IS_TRAIL(trail)) {
  1028. source++;
  1029. sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
       /* A complete pair is still reported as U_INVALID_CHAR_FOUND; sourceChar
        * now holds the supplementary code point. */
  1030. *err =U_INVALID_CHAR_FOUND;
  1031. /* convert this surrogate code point */
  1032. /* exit this condition tree */
  1033. } else {
  1034. /* this is an unmatched lead code unit (1st surrogate) */
  1035. /* callback(illegal) */
  1036. *err=U_ILLEGAL_CHAR_FOUND;
  1037. }
  1038. } else {
  1039. /* no more input */
       /* The lead unit is kept in fromUChar32 by the assignment below. */
  1040. *err = U_ZERO_ERROR;
  1041. }
  1042. } else {
  1043. /* this is an unmatched trail code unit (2nd surrogate) */
  1044. /* callback(illegal) */
  1045. *err=U_ILLEGAL_CHAR_FOUND;
  1046. }
  1047. } else {
  1048. /* callback(unassigned) for a BMP code point */
  1049. *err = U_INVALID_CHAR_FOUND;
  1050. }
  1051. args->converter->fromUChar32=sourceChar;
  1052. break;
  1053. }
  1054. }/* end while(mySourceIndex<mySourceLength) */
  1055. /*save the state and return */
  1056. args->source = source;
  1057. args->target = (char*)target;
  1058. }
  1059. static const uint16_t lookupTable[][2]={
  1060. { ZERO, ZERO }, /*DEFAULT*/
  1061. { ZERO, ZERO }, /*ROMAN*/
  1062. { DEVANAGARI, DEV_MASK },
  1063. { BENGALI, BNG_MASK },
  1064. { TAMIL, TML_MASK },
  1065. { TELUGU, KND_MASK },
  1066. { BENGALI, BNG_MASK },
  1067. { ORIYA, ORI_MASK },
  1068. { KANNADA, KND_MASK },
  1069. { MALAYALAM, MLM_MASK },
  1070. { GUJARATI, GJR_MASK },
  1071. { GURMUKHI, PNJ_MASK }
  1072. };
  /*
   * WRITE_TO_TARGET_TO_U: write one UTF-16 code unit to the output.
   *
   *   args           pointer whose ->targetLimit and ->converter are read
   *   source         not used by the expansion (kept for existing call sites)
   *   target         char16_t output cursor, advanced when the unit is written
   *   offsets        int32_t cursor, may be null; receives `offset` on a write
   *   offset         source offset to record
   *   targetUniChar  modifiable lvalue holding the code unit; it is UPDATED in
   *                  place: `delta` is added unless the value is <= ASCII_END
   *                  or is ZWJ, ZWNJ, DANDA or DOUBLE_DANDA
   *   delta          amount added to move the value into the current block
   *   err            pointer to the status; set to U_BUFFER_OVERFLOW_ERROR when
   *                  target is full
   *
   * When target has reached args->targetLimit the unit is appended to
   * args->converter->UCharErrorBuffer instead.
   *
   * Every parameter is parenthesized in the expansion so that expression
   * arguments (e.g. `&argsStruct`, `&status`) expand with the intended
   * precedence.  Arguments are still evaluated more than once.
   */
  #define WRITE_TO_TARGET_TO_U(args,source,target,offsets,offset,targetUniChar,delta, err) UPRV_BLOCK_MACRO_BEGIN { \
      /* add offset to current Indic Block */ \
      if ((targetUniChar) > ASCII_END && \
          (targetUniChar) != ZWJ && \
          (targetUniChar) != ZWNJ && \
          (targetUniChar) != DANDA && \
          (targetUniChar) != DOUBLE_DANDA) { \
          (targetUniChar) += (uint16_t)(delta); \
      } \
      /* now write the targetUniChar */ \
      if ((target) < (args)->targetLimit) { \
          *(target)++ = (char16_t)(targetUniChar); \
          if (offsets) { \
              *(offsets)++ = (int32_t)(offset); \
          } \
      } else { \
          (args)->converter->UCharErrorBuffer[(args)->converter->UCharErrorBufferLength++] = \
              (char16_t)(targetUniChar); \
          *(err) = U_BUFFER_OVERFLOW_ERROR; \
      } \
  } UPRV_BLOCK_MACRO_END
  /*
   * GET_MAPPING: look up the code unit for one ISCII byte.
   *
   *   sourceChar     the ISCII byte; used as the index into toUnicodeTable
   *   targetUniChar  modifiable lvalue that receives the result
   *   data           pointer whose ->currentMaskToUnicode and
   *                  ->currentDeltaToUnicode are read
   *
   * For bytes above ASCII_END the table value is replaced by missingCharMarker
   * when validityTable[value & 0x7F] has no bit in common with the current
   * mask.  The one exception: the value VOCALLIC_RR is kept when the current
   * delta is TELUGU_DELTA.
   *
   * Every parameter is parenthesized in the expansion so that expression
   * arguments (e.g. `&dataStruct`) expand with the intended precedence.
   * Arguments are still evaluated more than once.
   */
  #define GET_MAPPING(sourceChar,targetUniChar,data) UPRV_BLOCK_MACRO_BEGIN { \
      (targetUniChar) = toUnicodeTable[(sourceChar)]; \
      /* is the code point valid in current script? */ \
      if ((sourceChar) > ASCII_END && \
          (validityTable[((targetUniChar) & 0x7F)] & (data)->currentMaskToUnicode) == 0) { \
          /* Vocallic RR is assigned in ISCII Telugu and Unicode */ \
          if ((data)->currentDeltaToUnicode != (TELUGU_DELTA) || \
              (targetUniChar) != VOCALLIC_RR) { \
              (targetUniChar) = missingCharMarker; \
          } \
      } \
  } UPRV_BLOCK_MACRO_END
  1107. /***********
  1108. * Rules for ISCII to Unicode converter
  1109. * ISCII is a stateful encoding. To convert ISCII bytes to Unicode,
  1110. * which has both precomposed and decomposed forms of characters,
  1111. * pre-context and post-context need to be considered.
  1112. *
  1113. * Post context
  1114. * i) ATR : Attribute code is used to declare the font and script switching.
  1115. * Currently we only switch scripts; font codes are consumed without generating an error
  1116. * ii) EXT : Extension code is used to declare switching to Sanskrit and for obscure,
  1117. * obsolete characters
  1118. * Pre context
  1119. * i) Halant: if preceded by a halant then it is an explicit halant
  1120. * ii) Nukta :
  1121. * a) if preceded by a halant then it is a soft halant
  1122. * b) if preceded by specific consonants and the ligatures have pre-composed
  1123. * characters in Unicode then convert to pre-composed characters
  1124. * iii) Danda: If Danda is preceded by a Danda then convert to Double Danda
  1125. *
  1126. */
  1127. static void U_CALLCONV
  1128. UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, UErrorCode* err) {
  1129. const char *source = ( char *) args->source;
  1130. char16_t *target = args->target;
  1131. const char *sourceLimit = args->sourceLimit;
  1132. const char16_t* targetLimit = args->targetLimit;
  1133. uint32_t targetUniChar = 0x0000;
  1134. uint8_t sourceChar = 0x0000;
  1135. UConverterDataISCII* data;
  1136. UChar32* toUnicodeStatus=nullptr;
  1137. UChar32 tempTargetUniChar = 0x0000;
  1138. char16_t* contextCharToUnicode= nullptr;
  1139. UBool found;
  1140. int i;
  1141. int offset = 0;
  1142. if ((args->converter == nullptr) || (target < args->target) || (source < args->source)) {
  1143. *err = U_ILLEGAL_ARGUMENT_ERROR;
  1144. return;
  1145. }
  1146. data = (UConverterDataISCII*)(args->converter->extraInfo);
  1147. contextCharToUnicode = &data->contextCharToUnicode; /* contains previous ISCII codepoint visited */
  1148. toUnicodeStatus = (UChar32*)&args->converter->toUnicodeStatus;/* contains the mapping to Unicode of the above codepoint*/
  1149. while (U_SUCCESS(*err) && source<sourceLimit) {
  1150. targetUniChar = missingCharMarker;
  1151. if (target < targetLimit) {
  1152. sourceChar = (unsigned char)*(source)++;
  1153. /* look at the post-context perform special processing */
  1154. if (*contextCharToUnicode==ATR) {
  1155. /* If we have ATR in *contextCharToUnicode then we need to change our
  1156. * state to the Indic Script specified by sourceChar
  1157. */
  1158. /* check if the sourceChar is supported script range*/
  1159. if ((uint8_t)(PNJ-sourceChar)<=PNJ-DEV) {
  1160. data->currentDeltaToUnicode = (uint16_t)(lookupTable[sourceChar & 0x0F][0] * DELTA);
  1161. data->currentMaskToUnicode = (MaskEnum)lookupTable[sourceChar & 0x0F][1];
  1162. } else if (sourceChar==DEF) {
  1163. /* switch back to default */
  1164. data->currentDeltaToUnicode = data->defDeltaToUnicode;
  1165. data->currentMaskToUnicode = data->defMaskToUnicode;
  1166. } else {
  1167. if ((sourceChar >= 0x21 && sourceChar <= 0x3F)) {
  1168. /* these are display codes consume and continue */
  1169. } else {
  1170. *err =U_ILLEGAL_CHAR_FOUND;
  1171. /* reset */
  1172. *contextCharToUnicode=NO_CHAR_MARKER;
  1173. goto CALLBACK;
  1174. }
  1175. }
  1176. /* reset */
  1177. *contextCharToUnicode=NO_CHAR_MARKER;
  1178. continue;
  1179. } else if (*contextCharToUnicode==EXT) {
  1180. /* check if sourceChar is in 0xA1-0xEE range */
  1181. if ((uint8_t) (EXT_RANGE_END - sourceChar) <= (EXT_RANGE_END - EXT_RANGE_BEGIN)) {
  1182. /* We currently support only Anudatta and Devanagari abbreviation sign */
  1183. if (sourceChar==0xBF || sourceChar == 0xB8) {
  1184. targetUniChar = (sourceChar==0xBF) ? DEV_ABBR_SIGN : DEV_ANUDATTA;
  1185. /* find out if the mapping is valid in this state */
  1186. if (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode) {
  1187. *contextCharToUnicode= NO_CHAR_MARKER;
  1188. /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
  1189. if (data->prevToUnicodeStatus) {
  1190. WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err);
  1191. data->prevToUnicodeStatus = 0x0000;
  1192. }
  1193. /* write to target */
  1194. WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),targetUniChar,data->currentDeltaToUnicode,err);
  1195. continue;
  1196. }
  1197. }
  1198. /* byte unit is unassigned */
  1199. targetUniChar = missingCharMarker;
  1200. *err= U_INVALID_CHAR_FOUND;
  1201. } else {
  1202. /* only 0xA1 - 0xEE are legal after EXT char */
  1203. *contextCharToUnicode= NO_CHAR_MARKER;
  1204. *err = U_ILLEGAL_CHAR_FOUND;
  1205. }
  1206. goto CALLBACK;
  1207. } else if (*contextCharToUnicode==ISCII_INV) {
  1208. if (sourceChar==ISCII_HALANT) {
  1209. targetUniChar = 0x0020; /* replace with space according to Indic FAQ */
  1210. } else {
  1211. targetUniChar = ZWJ;
  1212. }
  1213. /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
  1214. if (data->prevToUnicodeStatus) {
  1215. WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err);
  1216. data->prevToUnicodeStatus = 0x0000;
  1217. }
  1218. /* write to target */
  1219. WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),targetUniChar,data->currentDeltaToUnicode,err);
  1220. /* reset */
  1221. *contextCharToUnicode=NO_CHAR_MARKER;
  1222. }
  1223. /* look at the pre-context and perform special processing */
  1224. switch (sourceChar) {
  1225. case ISCII_INV:
  1226. case EXT:
  1227. case ATR:
  1228. *contextCharToUnicode = (char16_t)sourceChar;
  1229. if (*toUnicodeStatus != missingCharMarker) {
  1230. /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
  1231. if (data->prevToUnicodeStatus) {
  1232. WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err);
  1233. data->prevToUnicodeStatus = 0x0000;
  1234. }
  1235. WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),*toUnicodeStatus,data->currentDeltaToUnicode,err);
  1236. *toUnicodeStatus = missingCharMarker;
  1237. }
  1238. continue;
  1239. case ISCII_DANDA:
  1240. /* handle double danda*/
  1241. if (*contextCharToUnicode== ISCII_DANDA) {
  1242. targetUniChar = DOUBLE_DANDA;
  1243. /* clear the context */
  1244. *contextCharToUnicode = NO_CHAR_MARKER;
  1245. *toUnicodeStatus = missingCharMarker;
  1246. } else {
  1247. GET_MAPPING(sourceChar,targetUniChar,data);
  1248. *contextCharToUnicode = sourceChar;
  1249. }
  1250. break;
  1251. case ISCII_HALANT:
  1252. /* handle explicit halant */
  1253. if (*contextCharToUnicode == ISCII_HALANT) {
  1254. targetUniChar = ZWNJ;
  1255. /* clear the context */
  1256. *contextCharToUnicode = NO_CHAR_MARKER;
  1257. } else {
  1258. GET_MAPPING(sourceChar,targetUniChar,data);
  1259. *contextCharToUnicode = sourceChar;
  1260. }
  1261. break;
  1262. case 0x0A:
  1263. case 0x0D:
  1264. data->resetToDefaultToUnicode = true;
  1265. GET_MAPPING(sourceChar,targetUniChar,data)
  1266. ;
  1267. *contextCharToUnicode = sourceChar;
  1268. break;
  1269. case ISCII_VOWEL_SIGN_E:
  1270. i=1;
  1271. found=false;
  1272. for (; i<vowelSignESpecialCases[0][0]; i++) {
  1273. U_ASSERT(i<UPRV_LENGTHOF(vowelSignESpecialCases));
  1274. if (vowelSignESpecialCases[i][0]==(uint8_t)*contextCharToUnicode) {
  1275. targetUniChar=vowelSignESpecialCases[i][1];
  1276. found=true;
  1277. break;
  1278. }
  1279. }
  1280. if (found) {
  1281. /* find out if the mapping is valid in this state */
  1282. if (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode) {
  1283. /*targetUniChar += data->currentDeltaToUnicode ;*/
  1284. *contextCharToUnicode= NO_CHAR_MARKER;
  1285. *toUnicodeStatus = missingCharMarker;
  1286. break;
  1287. }
  1288. }
  1289. GET_MAPPING(sourceChar,targetUniChar,data);
  1290. *contextCharToUnicode = sourceChar;
  1291. break;
  1292. case ISCII_NUKTA:
  1293. /* handle soft halant */
  1294. if (*contextCharToUnicode == ISCII_HALANT) {
  1295. targetUniChar = ZWJ;
  1296. /* clear the context */
  1297. *contextCharToUnicode = NO_CHAR_MARKER;
  1298. break;
  1299. } else if (data->currentDeltaToUnicode == PNJ_DELTA && data->contextCharToUnicode == 0xc0) {
  1300. /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
  1301. if (data->prevToUnicodeStatus) {
  1302. WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err);
  1303. data->prevToUnicodeStatus = 0x0000;
  1304. }
  1305. /* We got here because ISCII_NUKTA was preceded by 0xc0 and we are converting Gurmukhi.
  1306. * In that case we must convert (0xc0 0xe9) to (\u0a5c\u0a4d\u0a39).
  1307. */
  1308. targetUniChar = PNJ_RRA;
  1309. WRITE_TO_TARGET_TO_U(args, source, target, args->offsets, (source-args->source)-2, targetUniChar, 0, err);
  1310. if (U_SUCCESS(*err)) {
  1311. targetUniChar = PNJ_SIGN_VIRAMA;
  1312. WRITE_TO_TARGET_TO_U(args, source, target, args->offsets, (source-args->source)-2, targetUniChar, 0, err);
  1313. if (U_SUCCESS(*err)) {
  1314. targetUniChar = PNJ_HA;
  1315. WRITE_TO_TARGET_TO_U(args, source, target, args->offsets, (source-args->source)-2, targetUniChar, 0, err);
  1316. } else {
  1317. args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= PNJ_HA;
  1318. }
  1319. } else {
  1320. args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= PNJ_SIGN_VIRAMA;
  1321. args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= PNJ_HA;
  1322. }
  1323. *toUnicodeStatus = missingCharMarker;
  1324. data->contextCharToUnicode = NO_CHAR_MARKER;
  1325. continue;
  1326. } else {
  1327. /* try to handle <CHAR> + ISCII_NUKTA special mappings */
  1328. i=1;
  1329. found =false;
  1330. for (; i<nuktaSpecialCases[0][0]; i++) {
  1331. if (nuktaSpecialCases[i][0]==(uint8_t)
  1332. *contextCharToUnicode) {
  1333. targetUniChar=nuktaSpecialCases[i][1];
  1334. found =true;
  1335. break;
  1336. }
  1337. }
  1338. if (found) {
  1339. /* find out if the mapping is valid in this state */
  1340. if (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode) {
  1341. /*targetUniChar += data->currentDeltaToUnicode ;*/
  1342. *contextCharToUnicode= NO_CHAR_MARKER;
  1343. *toUnicodeStatus = missingCharMarker;
  1344. if (data->currentDeltaToUnicode == PNJ_DELTA) {
  1345. /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
  1346. if (data->prevToUnicodeStatus) {
  1347. WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err);
  1348. data->prevToUnicodeStatus = 0x0000;
  1349. }
  1350. WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),targetUniChar,data->currentDeltaToUnicode,err);
  1351. continue;
  1352. }
  1353. break;
  1354. }
  1355. /* else fall through to default */
  1356. }
  1357. /* else fall through to default */
  1358. U_FALLTHROUGH;
  1359. }
  1360. default:GET_MAPPING(sourceChar,targetUniChar,data)
  1361. ;
  1362. *contextCharToUnicode = sourceChar;
  1363. break;
  1364. }
  1365. if (*toUnicodeStatus != missingCharMarker) {
  1366. /* Check to make sure that consonant clusters are handled correct for Gurmukhi script. */
  1367. if (data->currentDeltaToUnicode == PNJ_DELTA && data->prevToUnicodeStatus != 0 && isPNJConsonant(data->prevToUnicodeStatus) &&
  1368. (*toUnicodeStatus + PNJ_DELTA) == PNJ_SIGN_VIRAMA && ((UChar32)(targetUniChar + PNJ_DELTA) == data->prevToUnicodeStatus)) {
  1369. /* Consonant clusters C + HALANT + C should be encoded as ADHAK + C */
  1370. offset = (int)(source-args->source - 3);
  1371. tempTargetUniChar = PNJ_ADHAK; /* This is necessary to avoid some compiler warnings. */
  1372. WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,offset,tempTargetUniChar,0,err);
  1373. WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,offset,data->prevToUnicodeStatus,0,err);
  1374. data->prevToUnicodeStatus = 0x0000; /* reset the previous unicode code point */
  1375. *toUnicodeStatus = missingCharMarker;
  1376. continue;
  1377. } else {
  1378. /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
  1379. if (data->prevToUnicodeStatus) {
  1380. WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err);
  1381. data->prevToUnicodeStatus = 0x0000;
  1382. }
  1383. /* Check to make sure that Bindi and Tippi are handled correctly for Gurmukhi script.
  1384. * If 0xA2 is preceded by a codepoint in the PNJ_BINDI_TIPPI_SET then the target codepoint should be Tippi instead of Bindi.
  1385. */
  1386. if (data->currentDeltaToUnicode == PNJ_DELTA && (targetUniChar + PNJ_DELTA) == PNJ_BINDI && isPNJBindiTippi((*toUnicodeStatus + PNJ_DELTA))) {
  1387. targetUniChar = PNJ_TIPPI - PNJ_DELTA;
  1388. WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),*toUnicodeStatus,PNJ_DELTA,err);
  1389. } else if (data->currentDeltaToUnicode == PNJ_DELTA && (targetUniChar + PNJ_DELTA) == PNJ_SIGN_VIRAMA && isPNJConsonant((*toUnicodeStatus + PNJ_DELTA))) {
  1390. /* Store the current toUnicodeStatus code point for later handling of consonant cluster in Gurmukhi. */
  1391. data->prevToUnicodeStatus = *toUnicodeStatus + PNJ_DELTA;
  1392. } else {
  1393. /* write the previously mapped codepoint */
  1394. WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),*toUnicodeStatus,data->currentDeltaToUnicode,err);
  1395. }
  1396. }
  1397. *toUnicodeStatus = missingCharMarker;
  1398. }
  1399. if (targetUniChar != missingCharMarker) {
  1400. /* now save the targetUniChar for delayed write */
  1401. *toUnicodeStatus = (char16_t) targetUniChar;
  1402. if (data->resetToDefaultToUnicode) {
  1403. data->currentDeltaToUnicode = data->defDeltaToUnicode;
  1404. data->currentMaskToUnicode = data->defMaskToUnicode;
  1405. data->resetToDefaultToUnicode=false;
  1406. }
  1407. } else {
  1408. /* we reach here only if targetUniChar == missingCharMarker
  1409. * so assign codes to reason and err
  1410. */
  1411. *err = U_INVALID_CHAR_FOUND;
  1412. CALLBACK:
  1413. args->converter->toUBytes[0] = sourceChar;
  1414. args->converter->toULength = 1;
  1415. break;
  1416. }
  1417. } else {
  1418. *err =U_BUFFER_OVERFLOW_ERROR;
  1419. break;
  1420. }
  1421. }
  1422. if (U_SUCCESS(*err) && args->flush && source == sourceLimit) {
  1423. /* end of the input stream */
  1424. UConverter *cnv = args->converter;
  1425. if (*contextCharToUnicode==ATR || *contextCharToUnicode==EXT || *contextCharToUnicode==ISCII_INV) {
  1426. /* set toUBytes[] */
  1427. cnv->toUBytes[0] = (uint8_t)*contextCharToUnicode;
  1428. cnv->toULength = 1;
  1429. /* avoid looping on truncated sequences */
  1430. *contextCharToUnicode = NO_CHAR_MARKER;
  1431. } else {
  1432. cnv->toULength = 0;
  1433. }
  1434. if (*toUnicodeStatus != missingCharMarker) {
  1435. /* output a remaining target character */
  1436. WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source - args->source -1),*toUnicodeStatus,data->currentDeltaToUnicode,err);
  1437. *toUnicodeStatus = missingCharMarker;
  1438. }
  1439. }
  1440. args->target = target;
  1441. args->source = source;
  1442. }
/*
 * Layout that _ISCII_SafeClone imposes on the caller-supplied clone buffer:
 * the converter object followed by a private copy of its ISCII state.
 * sizeof(struct cloneISCIIStruct) is the size reported for preflighting.
 *
 * NOTE(review): _ISCII_SafeClone casts the raw buffer to this struct and
 * returns &cnv, and its comment says the caller has already copied the main
 * UConverter into the buffer -- so cnv presumably has to stay the first
 * member. Confirm against ucnv_safeClone() before reordering.
 */
struct cloneISCIIStruct {
    UConverter cnv;               /* the cloned converter itself */
    UConverterDataISCII mydata;   /* clone-owned copy of the source converter's extraInfo */
};
  1448. static UConverter * U_CALLCONV
  1449. _ISCII_SafeClone(const UConverter *cnv,
  1450. void *stackBuffer,
  1451. int32_t *pBufferSize,
  1452. UErrorCode *status)
  1453. {
  1454. struct cloneISCIIStruct * localClone;
  1455. int32_t bufferSizeNeeded = sizeof(struct cloneISCIIStruct);
  1456. if (U_FAILURE(*status)) {
  1457. return nullptr;
  1458. }
  1459. if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
  1460. *pBufferSize = bufferSizeNeeded;
  1461. return nullptr;
  1462. }
  1463. localClone = (struct cloneISCIIStruct *)stackBuffer;
  1464. /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
  1465. uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(UConverterDataISCII));
  1466. localClone->cnv.extraInfo = &localClone->mydata;
  1467. localClone->cnv.isExtraLocal = true;
  1468. return &localClone->cnv;
  1469. }
  1470. static void U_CALLCONV
  1471. _ISCIIGetUnicodeSet(const UConverter *cnv,
  1472. const USetAdder *sa,
  1473. UConverterUnicodeSet which,
  1474. UErrorCode *pErrorCode)
  1475. {
  1476. (void)cnv;
  1477. (void)which;
  1478. (void)pErrorCode;
  1479. int32_t idx, script;
  1480. uint8_t mask;
  1481. /* Since all ISCII versions allow switching to other ISCII
  1482. scripts, we add all roundtrippable characters to this set. */
  1483. sa->addRange(sa->set, 0, ASCII_END);
  1484. for (script = DEVANAGARI; script <= MALAYALAM; script++) {
  1485. mask = (uint8_t)(lookupInitialData[script].maskEnum);
  1486. for (idx = 0; idx < DELTA; idx++) {
  1487. /* added check for TELUGU character */
  1488. if ((validityTable[idx] & mask) || (script==TELUGU && idx==0x31)) {
  1489. sa->add(sa->set, idx + (script * DELTA) + INDIC_BLOCK_BEGIN);
  1490. }
  1491. }
  1492. }
  1493. sa->add(sa->set, DANDA);
  1494. sa->add(sa->set, DOUBLE_DANDA);
  1495. sa->add(sa->set, ZWNJ);
  1496. sa->add(sa->set, ZWJ);
  1497. }
  1498. U_CDECL_END
/*
 * Implementation table for the ISCII converter; _ISCIIData below points at it.
 *
 * The initializers are positional, so their order has to match the member
 * order of UConverterImpl, which is declared outside this part of the file.
 * The per-slot notes are inferred from the names of the installed functions.
 * NOTE(review): confirm them against the UConverterImpl declaration.
 * A nullptr entry installs no ISCII-specific function for that slot.
 */
static const UConverterImpl _ISCIIImpl={
    UCNV_ISCII,                                   /* converter type */
    nullptr,
    nullptr,
    _ISCIIOpen,                                   /* open */
    _ISCIIClose,                                  /* close */
    _ISCIIReset,                                  /* reset */
    /* The same routine fills two adjacent slots per direction -- presumably
     * the without-offsets and with-offsets variants; TODO confirm. */
    UConverter_toUnicode_ISCII_OFFSETS_LOGIC,
    UConverter_toUnicode_ISCII_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISCII_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISCII_OFFSETS_LOGIC,
    nullptr,
    nullptr,
    _ISCIIgetName,                                /* name lookup */
    nullptr,
    _ISCII_SafeClone,                             /* clone hook */
    _ISCIIGetUnicodeSet,                          /* Unicode-set hook */
    nullptr,
    nullptr
};
/*
 * Static description of the ISCII converter; _ISCIIData below points at it.
 *
 * The initializers are positional and must follow the member order of
 * UConverterStaticData, which is declared outside this part of the file.
 * NOTE(review): the field labels below are my reading of that structure and
 * are not established by this file -- confirm against its declaration.
 */
static const UConverterStaticData _ISCIIStaticData={
    sizeof(UConverterStaticData),   /* size of this structure */
    "ISCII",                        /* converter name */
    0,                              /* presumably the codepage number -- TODO confirm */
    UCNV_IBM,                       /* presumably the platform -- TODO confirm */
    UCNV_ISCII,                     /* converter type, same value as in _ISCIIImpl */
    1,                              /* presumably min bytes per character -- TODO confirm */
    4,                              /* presumably max bytes per character -- TODO confirm */
    { 0x1a, 0, 0, 0 },              /* presumably the substitution byte sequence -- TODO confirm */
    0x1,                            /* presumably the substitution sequence length -- TODO confirm */
    false,
    false,
    0x0,
    0x0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */
};
/*
 * Shared-data object for the ISCII converter. Unlike the two tables it ties
 * together (_ISCIIStaticData and _ISCIIImpl), it is not declared static, so
 * it has external linkage. It is built with
 * UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER, which is defined outside this file.
 */
const UConverterSharedData _ISCIIData=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISCIIStaticData, &_ISCIIImpl);
  1537. #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */