usc_impl.cpp 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 1999-2016, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. **********************************************************************
  8. *
  9. * File USC_IMPL.C
  10. *
  11. * Modification History:
  12. *
  13. * Date Name Description
  14. * 07/08/2002 Eric Mader Creation.
  15. ******************************************************************************
  16. */
  17. #include "unicode/uscript.h"
  18. #include "usc_impl.h"
  19. #include "cmemory.h"
  /* Number of slots in the paired-punctuation stack stored in each UScriptRun. */
  #define PAREN_STACK_DEPTH 32

  /* Reduce a stack index modulo the stack size. */
  #define MOD(sp) ((sp) % PAREN_STACK_DEPTH)

  /* Add one to a counter, but never let it exceed PAREN_STACK_DEPTH. */
  #define LIMIT_INC(sp) (((sp) < PAREN_STACK_DEPTH) ? ((sp) + 1) : PAREN_STACK_DEPTH)

  /* Move a stack index forward by count slots, wrapping around the stack. */
  #define INC(sp, count) (MOD((sp) + (count)))
  #define INC1(sp) (INC(sp, 1))

  /*
   * Move a stack index backward by count slots, wrapping around the stack.
   * PAREN_STACK_DEPTH is added first so the left operand of % is not negative
   * for counts up to the stack size.
   */
  #define DEC(sp, count) (MOD((sp) + PAREN_STACK_DEPTH - (count)))
  #define DEC1(sp) (DEC(sp, 1))

  /* The stack is empty when no pushes are outstanding. */
  #define STACK_IS_EMPTY(scriptRun) ((scriptRun)->pushCount <= 0)
  #define STACK_IS_NOT_EMPTY(scriptRun) (! STACK_IS_EMPTY(scriptRun))

  /* The entry at the current stack pointer; only meaningful on a non-empty stack. */
  #define TOP(scriptRun) ((scriptRun)->parenStack[(scriptRun)->parenSP])

  /* Forget any pending fixups (used when a new run begins). */
  #define SYNC_FIXUP(scriptRun) ((scriptRun)->fixupCount = 0)
  31. struct ParenStackEntry
  32. {
  33. int32_t pairIndex;
  34. UScriptCode scriptCode;
  35. };
  36. struct UScriptRun
  37. {
  38. int32_t textLength;
  39. const char16_t *textArray;
  40. int32_t scriptStart;
  41. int32_t scriptLimit;
  42. UScriptCode scriptCode;
  43. struct ParenStackEntry parenStack[PAREN_STACK_DEPTH];
  44. int32_t parenSP;
  45. int32_t pushCount;
  46. int32_t fixupCount;
  47. };
  48. static int8_t highBit(int32_t value);
  49. static const UChar32 pairedChars[] = {
  50. 0x0028, 0x0029, /* ascii paired punctuation */
  51. 0x003c, 0x003e,
  52. 0x005b, 0x005d,
  53. 0x007b, 0x007d,
  54. 0x00ab, 0x00bb, /* guillemets */
  55. 0x2018, 0x2019, /* general punctuation */
  56. 0x201c, 0x201d,
  57. 0x2039, 0x203a,
  58. 0x3008, 0x3009, /* chinese paired punctuation */
  59. 0x300a, 0x300b,
  60. 0x300c, 0x300d,
  61. 0x300e, 0x300f,
  62. 0x3010, 0x3011,
  63. 0x3014, 0x3015,
  64. 0x3016, 0x3017,
  65. 0x3018, 0x3019,
  66. 0x301a, 0x301b
  67. };
  68. static void push(UScriptRun *scriptRun, int32_t pairIndex, UScriptCode scriptCode)
  69. {
  70. scriptRun->pushCount = LIMIT_INC(scriptRun->pushCount);
  71. scriptRun->fixupCount = LIMIT_INC(scriptRun->fixupCount);
  72. scriptRun->parenSP = INC1(scriptRun->parenSP);
  73. scriptRun->parenStack[scriptRun->parenSP].pairIndex = pairIndex;
  74. scriptRun->parenStack[scriptRun->parenSP].scriptCode = scriptCode;
  75. }
  76. static void pop(UScriptRun *scriptRun)
  77. {
  78. if (STACK_IS_EMPTY(scriptRun)) {
  79. return;
  80. }
  81. if (scriptRun->fixupCount > 0) {
  82. scriptRun->fixupCount -= 1;
  83. }
  84. scriptRun->pushCount -= 1;
  85. scriptRun->parenSP = DEC1(scriptRun->parenSP);
  86. /* If the stack is now empty, reset the stack
  87. pointers to their initial values.
  88. */
  89. if (STACK_IS_EMPTY(scriptRun)) {
  90. scriptRun->parenSP = -1;
  91. }
  92. }
  93. static void fixup(UScriptRun *scriptRun, UScriptCode scriptCode)
  94. {
  95. int32_t fixupSP = DEC(scriptRun->parenSP, scriptRun->fixupCount);
  96. while (scriptRun->fixupCount-- > 0) {
  97. fixupSP = INC1(fixupSP);
  98. scriptRun->parenStack[fixupSP].scriptCode = scriptCode;
  99. }
  100. }
  /*
   * Return the zero-based position of the highest set bit in value
   * (0 for 1, 1 for 2..3, 2 for 4..7, and so on).
   *
   * Returns -32 when value is zero or negative.
   */
  static int8_t
  highBit(int32_t value)
  {
      int8_t position = 0;
      int shift;

      if (value <= 0) {
          return -32;
      }

      /* Halve the search window each step: 16, 8, 4, 2, 1 bits. */
      for (shift = 16; shift > 0; shift >>= 1) {
          if (value >= (1 << shift)) {
              value >>= shift;
              position = (int8_t)(position + shift);
          }
      }

      return position;
  }
  130. static int32_t
  131. getPairIndex(UChar32 ch)
  132. {
  133. int32_t pairedCharCount = UPRV_LENGTHOF(pairedChars);
  134. int32_t pairedCharPower = 1 << highBit(pairedCharCount);
  135. int32_t pairedCharExtra = pairedCharCount - pairedCharPower;
  136. int32_t probe = pairedCharPower;
  137. int32_t pairIndex = 0;
  138. if (ch >= pairedChars[pairedCharExtra]) {
  139. pairIndex = pairedCharExtra;
  140. }
  141. while (probe > (1 << 0)) {
  142. probe >>= 1;
  143. if (ch >= pairedChars[pairIndex + probe]) {
  144. pairIndex += probe;
  145. }
  146. }
  147. if (pairedChars[pairIndex] != ch) {
  148. pairIndex = -1;
  149. }
  150. return pairIndex;
  151. }
  152. static UBool
  153. sameScript(UScriptCode scriptOne, UScriptCode scriptTwo)
  154. {
  155. return scriptOne <= USCRIPT_INHERITED || scriptTwo <= USCRIPT_INHERITED || scriptOne == scriptTwo;
  156. }
  157. U_CAPI UScriptRun * U_EXPORT2
  158. uscript_openRun(const char16_t *src, int32_t length, UErrorCode *pErrorCode)
  159. {
  160. UScriptRun *result = nullptr;
  161. if (pErrorCode == nullptr || U_FAILURE(*pErrorCode)) {
  162. return nullptr;
  163. }
  164. result = (UScriptRun *)uprv_malloc(sizeof (UScriptRun));
  165. if (result == nullptr) {
  166. *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
  167. return nullptr;
  168. }
  169. uscript_setRunText(result, src, length, pErrorCode);
  170. /* Release the UScriptRun if uscript_setRunText() returns an error */
  171. if (U_FAILURE(*pErrorCode)) {
  172. uprv_free(result);
  173. result = nullptr;
  174. }
  175. return result;
  176. }
  177. U_CAPI void U_EXPORT2
  178. uscript_closeRun(UScriptRun *scriptRun)
  179. {
  180. if (scriptRun != nullptr) {
  181. uprv_free(scriptRun);
  182. }
  183. }
  184. U_CAPI void U_EXPORT2
  185. uscript_resetRun(UScriptRun *scriptRun)
  186. {
  187. if (scriptRun != nullptr) {
  188. scriptRun->scriptStart = 0;
  189. scriptRun->scriptLimit = 0;
  190. scriptRun->scriptCode = USCRIPT_INVALID_CODE;
  191. scriptRun->parenSP = -1;
  192. scriptRun->pushCount = 0;
  193. scriptRun->fixupCount = 0;
  194. }
  195. }
  196. U_CAPI void U_EXPORT2
  197. uscript_setRunText(UScriptRun *scriptRun, const char16_t *src, int32_t length, UErrorCode *pErrorCode)
  198. {
  199. if (pErrorCode == nullptr || U_FAILURE(*pErrorCode)) {
  200. return;
  201. }
  202. if (scriptRun == nullptr || length < 0 || ((src == nullptr) != (length == 0))) {
  203. *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  204. return;
  205. }
  206. scriptRun->textArray = src;
  207. scriptRun->textLength = length;
  208. uscript_resetRun(scriptRun);
  209. }
  210. U_CAPI UBool U_EXPORT2
  211. uscript_nextRun(UScriptRun *scriptRun, int32_t *pRunStart, int32_t *pRunLimit, UScriptCode *pRunScript)
  212. {
  213. UErrorCode error = U_ZERO_ERROR;
  214. /* if we've fallen off the end of the text, we're done */
  215. if (scriptRun == nullptr || scriptRun->scriptLimit >= scriptRun->textLength) {
  216. return false;
  217. }
  218. SYNC_FIXUP(scriptRun);
  219. scriptRun->scriptCode = USCRIPT_COMMON;
  220. for (scriptRun->scriptStart = scriptRun->scriptLimit; scriptRun->scriptLimit < scriptRun->textLength; scriptRun->scriptLimit += 1) {
  221. char16_t high = scriptRun->textArray[scriptRun->scriptLimit];
  222. UChar32 ch = high;
  223. UScriptCode sc;
  224. int32_t pairIndex;
  225. /*
  226. * if the character is a high surrogate and it's not the last one
  227. * in the text, see if it's followed by a low surrogate
  228. */
  229. if (high >= 0xD800 && high <= 0xDBFF && scriptRun->scriptLimit < scriptRun->textLength - 1) {
  230. char16_t low = scriptRun->textArray[scriptRun->scriptLimit + 1];
  231. /*
  232. * if it is followed by a low surrogate,
  233. * consume it and form the full character
  234. */
  235. if (low >= 0xDC00 && low <= 0xDFFF) {
  236. ch = (high - 0xD800) * 0x0400 + low - 0xDC00 + 0x10000;
  237. scriptRun->scriptLimit += 1;
  238. }
  239. }
  240. sc = uscript_getScript(ch, &error);
  241. pairIndex = getPairIndex(ch);
  242. /*
  243. * Paired character handling:
  244. *
  245. * if it's an open character, push it onto the stack.
  246. * if it's a close character, find the matching open on the
  247. * stack, and use that script code. Any non-matching open
  248. * characters above it on the stack will be poped.
  249. */
  250. if (pairIndex >= 0) {
  251. if ((pairIndex & 1) == 0) {
  252. push(scriptRun, pairIndex, scriptRun->scriptCode);
  253. } else {
  254. int32_t pi = pairIndex & ~1;
  255. while (STACK_IS_NOT_EMPTY(scriptRun) && TOP(scriptRun).pairIndex != pi) {
  256. pop(scriptRun);
  257. }
  258. if (STACK_IS_NOT_EMPTY(scriptRun)) {
  259. sc = TOP(scriptRun).scriptCode;
  260. }
  261. }
  262. }
  263. if (sameScript(scriptRun->scriptCode, sc)) {
  264. if (scriptRun->scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) {
  265. scriptRun->scriptCode = sc;
  266. fixup(scriptRun, scriptRun->scriptCode);
  267. }
  268. /*
  269. * if this character is a close paired character,
  270. * pop the matching open character from the stack
  271. */
  272. if (pairIndex >= 0 && (pairIndex & 1) != 0) {
  273. pop(scriptRun);
  274. }
  275. } else {
  276. /*
  277. * if the run broke on a surrogate pair,
  278. * end it before the high surrogate
  279. */
  280. if (ch >= 0x10000) {
  281. scriptRun->scriptLimit -= 1;
  282. }
  283. break;
  284. }
  285. }
  286. if (pRunStart != nullptr) {
  287. *pRunStart = scriptRun->scriptStart;
  288. }
  289. if (pRunLimit != nullptr) {
  290. *pRunLimit = scriptRun->scriptLimit;
  291. }
  292. if (pRunScript != nullptr) {
  293. *pRunScript = scriptRun->scriptCode;
  294. }
  295. return true;
  296. }