brkeng.cpp 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ************************************************************************************
  5. * Copyright (C) 2006-2016, International Business Machines Corporation
  6. * and others. All Rights Reserved.
  7. ************************************************************************************
  8. */
  9. #include "unicode/utypes.h"
  10. #if !UCONFIG_NO_BREAK_ITERATION
  11. #include "unicode/uchar.h"
  12. #include "unicode/uniset.h"
  13. #include "unicode/chariter.h"
  14. #include "unicode/ures.h"
  15. #include "unicode/udata.h"
  16. #include "unicode/putil.h"
  17. #include "unicode/ustring.h"
  18. #include "unicode/uscript.h"
  19. #include "unicode/ucharstrie.h"
  20. #include "unicode/bytestrie.h"
  21. #include "brkeng.h"
  22. #include "cmemory.h"
  23. #include "dictbe.h"
  24. #include "lstmbe.h"
  25. #include "charstr.h"
  26. #include "dictionarydata.h"
  27. #include "mutex.h"
  28. #include "uvector.h"
  29. #include "umutex.h"
  30. #include "uresimp.h"
  31. #include "ubrkimpl.h"
  32. U_NAMESPACE_BEGIN
  33. /*
  34. ******************************************************************
  35. */
  36. LanguageBreakEngine::LanguageBreakEngine() {
  37. }
  38. LanguageBreakEngine::~LanguageBreakEngine() {
  39. }
  40. /*
  41. ******************************************************************
  42. */
  43. LanguageBreakFactory::LanguageBreakFactory() {
  44. }
  45. LanguageBreakFactory::~LanguageBreakFactory() {
  46. }
  47. /*
  48. ******************************************************************
  49. */
  50. UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) {
  51. (void)status;
  52. }
  53. UnhandledEngine::~UnhandledEngine() {
  54. delete fHandled;
  55. fHandled = nullptr;
  56. }
  57. UBool
  58. UnhandledEngine::handles(UChar32 c) const {
  59. return fHandled && fHandled->contains(c);
  60. }
  61. int32_t
  62. UnhandledEngine::findBreaks( UText *text,
  63. int32_t /* startPos */,
  64. int32_t endPos,
  65. UVector32 &/*foundBreaks*/,
  66. UBool /* isPhraseBreaking */,
  67. UErrorCode &status) const {
  68. if (U_FAILURE(status)) return 0;
  69. UChar32 c = utext_current32(text);
  70. while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
  71. utext_next32(text); // TODO: recast loop to work with post-increment operations.
  72. c = utext_current32(text);
  73. }
  74. return 0;
  75. }
  76. void
  77. UnhandledEngine::handleCharacter(UChar32 c) {
  78. if (fHandled == nullptr) {
  79. fHandled = new UnicodeSet();
  80. if (fHandled == nullptr) {
  81. return;
  82. }
  83. }
  84. if (!fHandled->contains(c)) {
  85. UErrorCode status = U_ZERO_ERROR;
  86. // Apply the entire script of the character.
  87. int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
  88. fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
  89. }
  90. }
  91. /*
  92. ******************************************************************
  93. */
  94. ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
  95. fEngines = 0;
  96. }
  97. ICULanguageBreakFactory::~ICULanguageBreakFactory() {
  98. if (fEngines != 0) {
  99. delete fEngines;
  100. }
  101. }
  102. U_NAMESPACE_END
  103. U_CDECL_BEGIN
  104. static void U_CALLCONV _deleteEngine(void *obj) {
  105. delete (const icu::LanguageBreakEngine *) obj;
  106. }
  107. U_CDECL_END
  108. U_NAMESPACE_BEGIN
  109. const LanguageBreakEngine *
  110. ICULanguageBreakFactory::getEngineFor(UChar32 c) {
  111. const LanguageBreakEngine *lbe = nullptr;
  112. UErrorCode status = U_ZERO_ERROR;
  113. static UMutex gBreakEngineMutex;
  114. Mutex m(&gBreakEngineMutex);
  115. if (fEngines == nullptr) {
  116. LocalPointer<UStack> engines(new UStack(_deleteEngine, nullptr, status), status);
  117. if (U_FAILURE(status) ) {
  118. // Note: no way to return error code to caller.
  119. return nullptr;
  120. }
  121. fEngines = engines.orphan();
  122. } else {
  123. int32_t i = fEngines->size();
  124. while (--i >= 0) {
  125. lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
  126. if (lbe != nullptr && lbe->handles(c)) {
  127. return lbe;
  128. }
  129. }
  130. }
  131. // We didn't find an engine. Create one.
  132. lbe = loadEngineFor(c);
  133. if (lbe != nullptr) {
  134. fEngines->push((void *)lbe, status);
  135. }
  136. return U_SUCCESS(status) ? lbe : nullptr;
  137. }
  138. const LanguageBreakEngine *
  139. ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
  140. UErrorCode status = U_ZERO_ERROR;
  141. UScriptCode code = uscript_getScript(c, &status);
  142. if (U_SUCCESS(status)) {
  143. const LanguageBreakEngine *engine = nullptr;
  144. // Try to use LSTM first
  145. const LSTMData *data = CreateLSTMDataForScript(code, status);
  146. if (U_SUCCESS(status)) {
  147. if (data != nullptr) {
  148. engine = CreateLSTMBreakEngine(code, data, status);
  149. if (U_SUCCESS(status) && engine != nullptr) {
  150. return engine;
  151. }
  152. if (engine != nullptr) {
  153. delete engine;
  154. engine = nullptr;
  155. } else {
  156. DeleteLSTMData(data);
  157. }
  158. }
  159. }
  160. status = U_ZERO_ERROR; // fallback to dictionary based
  161. DictionaryMatcher *m = loadDictionaryMatcherFor(code);
  162. if (m != nullptr) {
  163. switch(code) {
  164. case USCRIPT_THAI:
  165. engine = new ThaiBreakEngine(m, status);
  166. break;
  167. case USCRIPT_LAO:
  168. engine = new LaoBreakEngine(m, status);
  169. break;
  170. case USCRIPT_MYANMAR:
  171. engine = new BurmeseBreakEngine(m, status);
  172. break;
  173. case USCRIPT_KHMER:
  174. engine = new KhmerBreakEngine(m, status);
  175. break;
  176. #if !UCONFIG_NO_NORMALIZATION
  177. // CJK not available w/o normalization
  178. case USCRIPT_HANGUL:
  179. engine = new CjkBreakEngine(m, kKorean, status);
  180. break;
  181. // use same BreakEngine and dictionary for both Chinese and Japanese
  182. case USCRIPT_HIRAGANA:
  183. case USCRIPT_KATAKANA:
  184. case USCRIPT_HAN:
  185. engine = new CjkBreakEngine(m, kChineseJapanese, status);
  186. break;
  187. #if 0
  188. // TODO: Have to get some characters with script=common handled
  189. // by CjkBreakEngine (e.g. U+309B). Simply subjecting
  190. // them to CjkBreakEngine does not work. The engine has to
  191. // special-case them.
  192. case USCRIPT_COMMON:
  193. {
  194. UBlockCode block = ublock_getCode(code);
  195. if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
  196. engine = new CjkBreakEngine(dict, kChineseJapanese, status);
  197. break;
  198. }
  199. #endif
  200. #endif
  201. default:
  202. break;
  203. }
  204. if (engine == nullptr) {
  205. delete m;
  206. }
  207. else if (U_FAILURE(status)) {
  208. delete engine;
  209. engine = nullptr;
  210. }
  211. return engine;
  212. }
  213. }
  214. return nullptr;
  215. }
  216. DictionaryMatcher *
  217. ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
  218. UErrorCode status = U_ZERO_ERROR;
  219. // open root from brkitr tree.
  220. UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
  221. b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
  222. int32_t dictnlength = 0;
  223. const char16_t *dictfname =
  224. ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
  225. if (U_FAILURE(status)) {
  226. ures_close(b);
  227. return nullptr;
  228. }
  229. CharString dictnbuf;
  230. CharString ext;
  231. const char16_t *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot
  232. if (extStart != nullptr) {
  233. int32_t len = (int32_t)(extStart - dictfname);
  234. ext.appendInvariantChars(UnicodeString(false, extStart + 1, dictnlength - len - 1), status);
  235. dictnlength = len;
  236. }
  237. dictnbuf.appendInvariantChars(UnicodeString(false, dictfname, dictnlength), status);
  238. ures_close(b);
  239. UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
  240. if (U_SUCCESS(status)) {
  241. // build trie
  242. const uint8_t *data = (const uint8_t *)udata_getMemory(file);
  243. const int32_t *indexes = (const int32_t *)data;
  244. const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
  245. const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
  246. DictionaryMatcher *m = nullptr;
  247. if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
  248. const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
  249. const char *characters = (const char *)(data + offset);
  250. m = new BytesDictionaryMatcher(characters, transform, file);
  251. }
  252. else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
  253. const char16_t *characters = (const char16_t *)(data + offset);
  254. m = new UCharsDictionaryMatcher(characters, file);
  255. }
  256. if (m == nullptr) {
  257. // no matcher exists to take ownership - either we are an invalid
  258. // type or memory allocation failed
  259. udata_close(file);
  260. }
  261. return m;
  262. } else if (dictfname != nullptr) {
  263. // we don't have a dictionary matcher.
  264. // returning nullptr here will cause us to fail to find a dictionary break engine, as expected
  265. status = U_ZERO_ERROR;
  266. return nullptr;
  267. }
  268. return nullptr;
  269. }
  270. U_NAMESPACE_END
  271. #endif /* #if !UCONFIG_NO_BREAK_ITERATION */