brkeng.cpp 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ************************************************************************************
  5. * Copyright (C) 2006-2016, International Business Machines Corporation
  6. * and others. All Rights Reserved.
  7. ************************************************************************************
  8. */
  9. #include "unicode/utypes.h"
  10. #if !UCONFIG_NO_BREAK_ITERATION
  11. #include "unicode/uchar.h"
  12. #include "unicode/uniset.h"
  13. #include "unicode/chariter.h"
  14. #include "unicode/ures.h"
  15. #include "unicode/udata.h"
  16. #include "unicode/putil.h"
  17. #include "unicode/ustring.h"
  18. #include "unicode/uscript.h"
  19. #include "unicode/ucharstrie.h"
  20. #include "unicode/bytestrie.h"
  21. #include "brkeng.h"
  22. #include "cmemory.h"
  23. #include "dictbe.h"
  24. #include "lstmbe.h"
  25. #include "charstr.h"
  26. #include "dictionarydata.h"
  27. #include "mutex.h"
  28. #include "uvector.h"
  29. #include "umutex.h"
  30. #include "uresimp.h"
  31. #include "ubrkimpl.h"
  32. U_NAMESPACE_BEGIN
  33. /*
  34. ******************************************************************
  35. */
  36. LanguageBreakEngine::LanguageBreakEngine() {
  37. }
  38. LanguageBreakEngine::~LanguageBreakEngine() {
  39. }
  40. /*
  41. ******************************************************************
  42. */
  43. LanguageBreakFactory::LanguageBreakFactory() {
  44. }
  45. LanguageBreakFactory::~LanguageBreakFactory() {
  46. }
  47. /*
  48. ******************************************************************
  49. */
  50. UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) {
  51. (void)status;
  52. }
  53. UnhandledEngine::~UnhandledEngine() {
  54. delete fHandled;
  55. fHandled = nullptr;
  56. }
  57. UBool
  58. UnhandledEngine::handles(UChar32 c) const {
  59. return fHandled && fHandled->contains(c);
  60. }
  61. int32_t
  62. UnhandledEngine::findBreaks( UText *text,
  63. int32_t /* startPos */,
  64. int32_t endPos,
  65. UVector32 &/*foundBreaks*/,
  66. UErrorCode &status) const {
  67. if (U_FAILURE(status)) return 0;
  68. UChar32 c = utext_current32(text);
  69. while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
  70. utext_next32(text); // TODO: recast loop to work with post-increment operations.
  71. c = utext_current32(text);
  72. }
  73. return 0;
  74. }
  75. void
  76. UnhandledEngine::handleCharacter(UChar32 c) {
  77. if (fHandled == nullptr) {
  78. fHandled = new UnicodeSet();
  79. if (fHandled == nullptr) {
  80. return;
  81. }
  82. }
  83. if (!fHandled->contains(c)) {
  84. UErrorCode status = U_ZERO_ERROR;
  85. // Apply the entire script of the character.
  86. int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
  87. fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
  88. }
  89. }
  90. /*
  91. ******************************************************************
  92. */
  93. ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
  94. fEngines = 0;
  95. }
  96. ICULanguageBreakFactory::~ICULanguageBreakFactory() {
  97. if (fEngines != 0) {
  98. delete fEngines;
  99. }
  100. }
  101. U_NAMESPACE_END
  102. U_CDECL_BEGIN
  103. static void U_CALLCONV _deleteEngine(void *obj) {
  104. delete (const icu::LanguageBreakEngine *) obj;
  105. }
  106. U_CDECL_END
  107. U_NAMESPACE_BEGIN
  108. const LanguageBreakEngine *
  109. ICULanguageBreakFactory::getEngineFor(UChar32 c) {
  110. const LanguageBreakEngine *lbe = NULL;
  111. UErrorCode status = U_ZERO_ERROR;
  112. static UMutex gBreakEngineMutex;
  113. Mutex m(&gBreakEngineMutex);
  114. if (fEngines == nullptr) {
  115. LocalPointer<UStack> engines(new UStack(_deleteEngine, nullptr, status), status);
  116. if (U_FAILURE(status) ) {
  117. // Note: no way to return error code to caller.
  118. return nullptr;
  119. }
  120. fEngines = engines.orphan();
  121. } else {
  122. int32_t i = fEngines->size();
  123. while (--i >= 0) {
  124. lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
  125. if (lbe != NULL && lbe->handles(c)) {
  126. return lbe;
  127. }
  128. }
  129. }
  130. // We didn't find an engine. Create one.
  131. lbe = loadEngineFor(c);
  132. if (lbe != nullptr) {
  133. fEngines->push((void *)lbe, status);
  134. }
  135. return U_SUCCESS(status) ? lbe : nullptr;
  136. }
  137. const LanguageBreakEngine *
  138. ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
  139. UErrorCode status = U_ZERO_ERROR;
  140. UScriptCode code = uscript_getScript(c, &status);
  141. if (U_SUCCESS(status)) {
  142. const LanguageBreakEngine *engine = nullptr;
  143. // Try to use LSTM first
  144. const LSTMData *data = CreateLSTMDataForScript(code, status);
  145. if (U_SUCCESS(status)) {
  146. if (data != nullptr) {
  147. engine = CreateLSTMBreakEngine(code, data, status);
  148. if (U_SUCCESS(status) && engine != nullptr) {
  149. return engine;
  150. }
  151. if (engine != nullptr) {
  152. delete engine;
  153. engine = nullptr;
  154. } else {
  155. DeleteLSTMData(data);
  156. }
  157. }
  158. }
  159. status = U_ZERO_ERROR; // fallback to dictionary based
  160. DictionaryMatcher *m = loadDictionaryMatcherFor(code);
  161. if (m != NULL) {
  162. switch(code) {
  163. case USCRIPT_THAI:
  164. engine = new ThaiBreakEngine(m, status);
  165. break;
  166. case USCRIPT_LAO:
  167. engine = new LaoBreakEngine(m, status);
  168. break;
  169. case USCRIPT_MYANMAR:
  170. engine = new BurmeseBreakEngine(m, status);
  171. break;
  172. case USCRIPT_KHMER:
  173. engine = new KhmerBreakEngine(m, status);
  174. break;
  175. #if !UCONFIG_NO_NORMALIZATION
  176. // CJK not available w/o normalization
  177. case USCRIPT_HANGUL:
  178. engine = new CjkBreakEngine(m, kKorean, status);
  179. break;
  180. // use same BreakEngine and dictionary for both Chinese and Japanese
  181. case USCRIPT_HIRAGANA:
  182. case USCRIPT_KATAKANA:
  183. case USCRIPT_HAN:
  184. engine = new CjkBreakEngine(m, kChineseJapanese, status);
  185. break;
  186. #if 0
  187. // TODO: Have to get some characters with script=common handled
  188. // by CjkBreakEngine (e.g. U+309B). Simply subjecting
  189. // them to CjkBreakEngine does not work. The engine has to
  190. // special-case them.
  191. case USCRIPT_COMMON:
  192. {
  193. UBlockCode block = ublock_getCode(code);
  194. if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
  195. engine = new CjkBreakEngine(dict, kChineseJapanese, status);
  196. break;
  197. }
  198. #endif
  199. #endif
  200. default:
  201. break;
  202. }
  203. if (engine == NULL) {
  204. delete m;
  205. }
  206. else if (U_FAILURE(status)) {
  207. delete engine;
  208. engine = NULL;
  209. }
  210. return engine;
  211. }
  212. }
  213. return NULL;
  214. }
  215. DictionaryMatcher *
  216. ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
  217. UErrorCode status = U_ZERO_ERROR;
  218. // open root from brkitr tree.
  219. UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
  220. b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
  221. int32_t dictnlength = 0;
  222. const UChar *dictfname =
  223. ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
  224. if (U_FAILURE(status)) {
  225. ures_close(b);
  226. return NULL;
  227. }
  228. CharString dictnbuf;
  229. CharString ext;
  230. const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot
  231. if (extStart != NULL) {
  232. int32_t len = (int32_t)(extStart - dictfname);
  233. ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
  234. dictnlength = len;
  235. }
  236. dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
  237. ures_close(b);
  238. UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
  239. if (U_SUCCESS(status)) {
  240. // build trie
  241. const uint8_t *data = (const uint8_t *)udata_getMemory(file);
  242. const int32_t *indexes = (const int32_t *)data;
  243. const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
  244. const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
  245. DictionaryMatcher *m = NULL;
  246. if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
  247. const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
  248. const char *characters = (const char *)(data + offset);
  249. m = new BytesDictionaryMatcher(characters, transform, file);
  250. }
  251. else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
  252. const UChar *characters = (const UChar *)(data + offset);
  253. m = new UCharsDictionaryMatcher(characters, file);
  254. }
  255. if (m == NULL) {
  256. // no matcher exists to take ownership - either we are an invalid
  257. // type or memory allocation failed
  258. udata_close(file);
  259. }
  260. return m;
  261. } else if (dictfname != NULL) {
  262. // we don't have a dictionary matcher.
  263. // returning NULL here will cause us to fail to find a dictionary break engine, as expected
  264. status = U_ZERO_ERROR;
  265. return NULL;
  266. }
  267. return NULL;
  268. }
  269. U_NAMESPACE_END
  270. #endif /* #if !UCONFIG_NO_BREAK_ITERATION */