brkeng.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ************************************************************************************
  5. * Copyright (C) 2006-2016, International Business Machines Corporation
  6. * and others. All Rights Reserved.
  7. ************************************************************************************
  8. */
  9. #include "unicode/utypes.h"
  10. #if !UCONFIG_NO_BREAK_ITERATION
  11. #include "unicode/uchar.h"
  12. #include "unicode/uniset.h"
  13. #include "unicode/chariter.h"
  14. #include "unicode/ures.h"
  15. #include "unicode/udata.h"
  16. #include "unicode/putil.h"
  17. #include "unicode/ustring.h"
  18. #include "unicode/uscript.h"
  19. #include "unicode/ucharstrie.h"
  20. #include "unicode/bytestrie.h"
  21. #include "unicode/rbbi.h"
  22. #include "brkeng.h"
  23. #include "cmemory.h"
  24. #include "dictbe.h"
  25. #include "lstmbe.h"
  26. #include "charstr.h"
  27. #include "dictionarydata.h"
  28. #include "mutex.h"
  29. #include "uvector.h"
  30. #include "umutex.h"
  31. #include "uresimp.h"
  32. #include "ubrkimpl.h"
  33. U_NAMESPACE_BEGIN
  34. /*
  35. ******************************************************************
  36. */
  37. LanguageBreakEngine::LanguageBreakEngine() {
  38. }
  39. LanguageBreakEngine::~LanguageBreakEngine() {
  40. }
  41. /*
  42. ******************************************************************
  43. */
  44. LanguageBreakFactory::LanguageBreakFactory() {
  45. }
  46. LanguageBreakFactory::~LanguageBreakFactory() {
  47. }
  48. /*
  49. ******************************************************************
  50. */
  51. UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) {
  52. (void)status;
  53. }
  54. UnhandledEngine::~UnhandledEngine() {
  55. delete fHandled;
  56. fHandled = nullptr;
  57. }
  58. UBool
  59. UnhandledEngine::handles(UChar32 c, const char* locale) const {
  60. (void)locale; // Unused
  61. return fHandled && fHandled->contains(c);
  62. }
  63. int32_t
  64. UnhandledEngine::findBreaks( UText *text,
  65. int32_t startPos,
  66. int32_t endPos,
  67. UVector32 &/*foundBreaks*/,
  68. UBool /* isPhraseBreaking */,
  69. UErrorCode &status) const {
  70. if (U_FAILURE(status)) return 0;
  71. utext_setNativeIndex(text, startPos);
  72. UChar32 c = utext_current32(text);
  73. while (static_cast<int32_t>(utext_getNativeIndex(text)) < endPos && fHandled->contains(c)) {
  74. utext_next32(text); // TODO: recast loop to work with post-increment operations.
  75. c = utext_current32(text);
  76. }
  77. return 0;
  78. }
  79. void
  80. UnhandledEngine::handleCharacter(UChar32 c) {
  81. if (fHandled == nullptr) {
  82. fHandled = new UnicodeSet();
  83. if (fHandled == nullptr) {
  84. return;
  85. }
  86. }
  87. if (!fHandled->contains(c)) {
  88. UErrorCode status = U_ZERO_ERROR;
  89. // Apply the entire script of the character.
  90. int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
  91. fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
  92. }
  93. }
  94. /*
  95. ******************************************************************
  96. */
  97. ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
  98. fEngines = nullptr;
  99. }
  100. ICULanguageBreakFactory::~ICULanguageBreakFactory() {
  101. delete fEngines;
  102. }
  103. void ICULanguageBreakFactory::ensureEngines(UErrorCode& status) {
  104. static UMutex gBreakEngineMutex;
  105. Mutex m(&gBreakEngineMutex);
  106. if (fEngines == nullptr) {
  107. LocalPointer<UStack> engines(new UStack(uprv_deleteUObject, nullptr, status), status);
  108. if (U_SUCCESS(status)) {
  109. fEngines = engines.orphan();
  110. }
  111. }
  112. }
  113. const LanguageBreakEngine *
  114. ICULanguageBreakFactory::getEngineFor(UChar32 c, const char* locale) {
  115. const LanguageBreakEngine *lbe = nullptr;
  116. UErrorCode status = U_ZERO_ERROR;
  117. ensureEngines(status);
  118. if (U_FAILURE(status) ) {
  119. // Note: no way to return error code to caller.
  120. return nullptr;
  121. }
  122. static UMutex gBreakEngineMutex;
  123. Mutex m(&gBreakEngineMutex);
  124. int32_t i = fEngines->size();
  125. while (--i >= 0) {
  126. lbe = static_cast<const LanguageBreakEngine*>(fEngines->elementAt(i));
  127. if (lbe != nullptr && lbe->handles(c, locale)) {
  128. return lbe;
  129. }
  130. }
  131. // We didn't find an engine. Create one.
  132. lbe = loadEngineFor(c, locale);
  133. if (lbe != nullptr) {
  134. fEngines->push((void *)lbe, status);
  135. }
  136. return U_SUCCESS(status) ? lbe : nullptr;
  137. }
  138. const LanguageBreakEngine *
  139. ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) {
  140. UErrorCode status = U_ZERO_ERROR;
  141. UScriptCode code = uscript_getScript(c, &status);
  142. if (U_SUCCESS(status)) {
  143. const LanguageBreakEngine *engine = nullptr;
  144. // Try to use LSTM first
  145. const LSTMData *data = CreateLSTMDataForScript(code, status);
  146. if (U_SUCCESS(status)) {
  147. if (data != nullptr) {
  148. engine = CreateLSTMBreakEngine(code, data, status);
  149. if (U_SUCCESS(status) && engine != nullptr) {
  150. return engine;
  151. }
  152. if (engine != nullptr) {
  153. delete engine;
  154. engine = nullptr;
  155. } else {
  156. DeleteLSTMData(data);
  157. }
  158. }
  159. }
  160. status = U_ZERO_ERROR; // fallback to dictionary based
  161. DictionaryMatcher *m = loadDictionaryMatcherFor(code);
  162. if (m != nullptr) {
  163. switch(code) {
  164. case USCRIPT_THAI:
  165. engine = new ThaiBreakEngine(m, status);
  166. break;
  167. case USCRIPT_LAO:
  168. engine = new LaoBreakEngine(m, status);
  169. break;
  170. case USCRIPT_MYANMAR:
  171. engine = new BurmeseBreakEngine(m, status);
  172. break;
  173. case USCRIPT_KHMER:
  174. engine = new KhmerBreakEngine(m, status);
  175. break;
  176. #if !UCONFIG_NO_NORMALIZATION
  177. // CJK not available w/o normalization
  178. case USCRIPT_HANGUL:
  179. engine = new CjkBreakEngine(m, kKorean, status);
  180. break;
  181. // use same BreakEngine and dictionary for both Chinese and Japanese
  182. case USCRIPT_HIRAGANA:
  183. case USCRIPT_KATAKANA:
  184. case USCRIPT_HAN:
  185. engine = new CjkBreakEngine(m, kChineseJapanese, status);
  186. break;
  187. #if 0
  188. // TODO: Have to get some characters with script=common handled
  189. // by CjkBreakEngine (e.g. U+309B). Simply subjecting
  190. // them to CjkBreakEngine does not work. The engine has to
  191. // special-case them.
  192. case USCRIPT_COMMON:
  193. {
  194. UBlockCode block = ublock_getCode(code);
  195. if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
  196. engine = new CjkBreakEngine(dict, kChineseJapanese, status);
  197. break;
  198. }
  199. #endif
  200. #endif
  201. default:
  202. break;
  203. }
  204. if (engine == nullptr) {
  205. delete m;
  206. }
  207. else if (U_FAILURE(status)) {
  208. delete engine;
  209. engine = nullptr;
  210. }
  211. return engine;
  212. }
  213. }
  214. return nullptr;
  215. }
  216. DictionaryMatcher *
  217. ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
  218. UErrorCode status = U_ZERO_ERROR;
  219. // open root from brkitr tree.
  220. UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
  221. b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
  222. int32_t dictnlength = 0;
  223. const char16_t *dictfname =
  224. ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
  225. if (U_FAILURE(status)) {
  226. ures_close(b);
  227. return nullptr;
  228. }
  229. CharString dictnbuf;
  230. CharString ext;
  231. const char16_t *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot
  232. if (extStart != nullptr) {
  233. int32_t len = static_cast<int32_t>(extStart - dictfname);
  234. ext.appendInvariantChars(UnicodeString(false, extStart + 1, dictnlength - len - 1), status);
  235. dictnlength = len;
  236. }
  237. dictnbuf.appendInvariantChars(UnicodeString(false, dictfname, dictnlength), status);
  238. ures_close(b);
  239. UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
  240. if (U_SUCCESS(status)) {
  241. // build trie
  242. const uint8_t* data = static_cast<const uint8_t*>(udata_getMemory(file));
  243. const int32_t* indexes = reinterpret_cast<const int32_t*>(data);
  244. const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
  245. const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
  246. DictionaryMatcher *m = nullptr;
  247. if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
  248. const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
  249. const char* characters = reinterpret_cast<const char*>(data + offset);
  250. m = new BytesDictionaryMatcher(characters, transform, file);
  251. }
  252. else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
  253. const char16_t* characters = reinterpret_cast<const char16_t*>(data + offset);
  254. m = new UCharsDictionaryMatcher(characters, file);
  255. }
  256. if (m == nullptr) {
  257. // no matcher exists to take ownership - either we are an invalid
  258. // type or memory allocation failed
  259. udata_close(file);
  260. }
  261. return m;
  262. } else if (dictfname != nullptr) {
  263. // we don't have a dictionary matcher.
  264. // returning nullptr here will cause us to fail to find a dictionary break engine, as expected
  265. status = U_ZERO_ERROR;
  266. return nullptr;
  267. }
  268. return nullptr;
  269. }
  270. void ICULanguageBreakFactory::addExternalEngine(
  271. ExternalBreakEngine* external, UErrorCode& status) {
  272. LocalPointer<ExternalBreakEngine> engine(external, status);
  273. ensureEngines(status);
  274. LocalPointer<BreakEngineWrapper> wrapper(
  275. new BreakEngineWrapper(engine.orphan(), status), status);
  276. static UMutex gBreakEngineMutex;
  277. Mutex m(&gBreakEngineMutex);
  278. fEngines->push(wrapper.getAlias(), status);
  279. wrapper.orphan();
  280. }
  281. BreakEngineWrapper::BreakEngineWrapper(
  282. ExternalBreakEngine* engine, UErrorCode &status) : delegate(engine, status) {
  283. }
  284. BreakEngineWrapper::~BreakEngineWrapper() {
  285. }
  286. UBool BreakEngineWrapper::handles(UChar32 c, const char* locale) const {
  287. return delegate->isFor(c, locale);
  288. }
  289. int32_t BreakEngineWrapper::findBreaks(
  290. UText *text,
  291. int32_t startPos,
  292. int32_t endPos,
  293. UVector32 &foundBreaks,
  294. UBool /* isPhraseBreaking */,
  295. UErrorCode &status) const {
  296. if (U_FAILURE(status)) return 0;
  297. int32_t result = 0;
  298. // Find the span of characters included in the set.
  299. // The span to break begins at the current position in the text, and
  300. // extends towards the start or end of the text, depending on 'reverse'.
  301. utext_setNativeIndex(text, startPos);
  302. int32_t start = static_cast<int32_t>(utext_getNativeIndex(text));
  303. int32_t current;
  304. int32_t rangeStart;
  305. int32_t rangeEnd;
  306. UChar32 c = utext_current32(text);
  307. while ((current = static_cast<int32_t>(utext_getNativeIndex(text))) < endPos && delegate->handles(c)) {
  308. utext_next32(text); // TODO: recast loop for postincrement
  309. c = utext_current32(text);
  310. }
  311. rangeStart = start;
  312. rangeEnd = current;
  313. int32_t beforeSize = foundBreaks.size();
  314. int32_t additionalCapacity = rangeEnd - rangeStart + 1;
  315. // enlarge to contains (rangeEnd-rangeStart+1) more items
  316. foundBreaks.ensureCapacity(beforeSize+additionalCapacity, status);
  317. if (U_FAILURE(status)) return 0;
  318. foundBreaks.setSize(beforeSize + beforeSize+additionalCapacity);
  319. result = delegate->fillBreaks(text, rangeStart, rangeEnd, foundBreaks.getBuffer()+beforeSize,
  320. additionalCapacity, status);
  321. if (U_FAILURE(status)) return 0;
  322. foundBreaks.setSize(beforeSize + result);
  323. utext_setNativeIndex(text, current);
  324. return result;
  325. }
  326. U_NAMESPACE_END
  327. #endif /* #if !UCONFIG_NO_BREAK_ITERATION */