brkiter.cpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. * Copyright (C) 1997-2015, International Business Machines Corporation and
  6. * others. All Rights Reserved.
  7. *******************************************************************************
  8. *
  9. * File brkiter.cpp
  10. *
  11. * Modification History:
  12. *
  13. * Date Name Description
  14. * 02/18/97 aliu Converted from OpenClass. Added DONE.
  15. * 01/13/2000 helena Added UErrorCode parameter to createXXXInstance methods.
  16. *****************************************************************************************
  17. */
  18. // *****************************************************************************
  19. // This file was generated from the java source file BreakIterator.java
  20. // *****************************************************************************
  21. #include "unicode/utypes.h"
  22. #if !UCONFIG_NO_BREAK_ITERATION
  23. #include "unicode/rbbi.h"
  24. #include "unicode/brkiter.h"
  25. #include "unicode/udata.h"
  26. #include "unicode/uloc.h"
  27. #include "unicode/ures.h"
  28. #include "unicode/ustring.h"
  29. #include "unicode/filteredbrk.h"
  30. #include "bytesinkutil.h"
  31. #include "ucln_cmn.h"
  32. #include "cstring.h"
  33. #include "umutex.h"
  34. #include "servloc.h"
  35. #include "locbased.h"
  36. #include "uresimp.h"
  37. #include "uassert.h"
  38. #include "ubrkimpl.h"
  39. #include "utracimp.h"
  40. #include "charstr.h"
  41. // *****************************************************************************
  42. // class BreakIterator
  43. // This class implements methods for finding the location of boundaries in text.
  44. // Instances of BreakIterator maintain a current position and scan over text
  45. // returning the index of characters where boundaries occur.
  46. // *****************************************************************************
  47. U_NAMESPACE_BEGIN
  48. // -------------------------------------
  49. BreakIterator*
  50. BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &status)
  51. {
  52. char fnbuff[256];
  53. char ext[4]={'\0'};
  54. CharString actualLocale;
  55. int32_t size;
  56. const char16_t* brkfname = nullptr;
  57. UResourceBundle brkRulesStack;
  58. UResourceBundle brkNameStack;
  59. UResourceBundle *brkRules = &brkRulesStack;
  60. UResourceBundle *brkName = &brkNameStack;
  61. RuleBasedBreakIterator *result = nullptr;
  62. if (U_FAILURE(status))
  63. return nullptr;
  64. ures_initStackObject(brkRules);
  65. ures_initStackObject(brkName);
  66. // Get the locale
  67. UResourceBundle *b = ures_openNoDefault(U_ICUDATA_BRKITR, loc.getName(), &status);
  68. // Get the "boundaries" array.
  69. if (U_SUCCESS(status)) {
  70. brkRules = ures_getByKeyWithFallback(b, "boundaries", brkRules, &status);
  71. // Get the string object naming the rules file
  72. brkName = ures_getByKeyWithFallback(brkRules, type, brkName, &status);
  73. // Get the actual string
  74. brkfname = ures_getString(brkName, &size, &status);
  75. U_ASSERT((size_t)size<sizeof(fnbuff));
  76. if (static_cast<size_t>(size) >= sizeof(fnbuff)) {
  77. size=0;
  78. if (U_SUCCESS(status)) {
  79. status = U_BUFFER_OVERFLOW_ERROR;
  80. }
  81. }
  82. // Use the string if we found it
  83. if (U_SUCCESS(status) && brkfname) {
  84. actualLocale.append(ures_getLocaleInternal(brkName, &status), -1, status);
  85. char16_t* extStart=u_strchr(brkfname, 0x002e);
  86. int len = 0;
  87. if (extStart != nullptr){
  88. len = static_cast<int>(extStart - brkfname);
  89. u_UCharsToChars(extStart+1, ext, sizeof(ext)); // nul terminates the buff
  90. u_UCharsToChars(brkfname, fnbuff, len);
  91. }
  92. fnbuff[len]=0; // nul terminate
  93. }
  94. }
  95. ures_close(brkRules);
  96. ures_close(brkName);
  97. UDataMemory* file = udata_open(U_ICUDATA_BRKITR, ext, fnbuff, &status);
  98. if (U_FAILURE(status)) {
  99. ures_close(b);
  100. return nullptr;
  101. }
  102. // Create a RuleBasedBreakIterator
  103. result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != nullptr, status);
  104. // If there is a result, set the valid locale and actual locale, and the kind
  105. if (U_SUCCESS(status) && result != nullptr) {
  106. U_LOCALE_BASED(locBased, *(BreakIterator*)result);
  107. locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status),
  108. actualLocale.data());
  109. uprv_strncpy(result->requestLocale, loc.getName(), ULOC_FULLNAME_CAPACITY);
  110. result->requestLocale[ULOC_FULLNAME_CAPACITY-1] = 0; // always terminate
  111. }
  112. ures_close(b);
  113. if (U_FAILURE(status) && result != nullptr) { // Sometimes redundant check, but simple
  114. delete result;
  115. return nullptr;
  116. }
  117. if (result == nullptr) {
  118. udata_close(file);
  119. if (U_SUCCESS(status)) {
  120. status = U_MEMORY_ALLOCATION_ERROR;
  121. }
  122. }
  123. return result;
  124. }
  125. // Creates a break iterator for word breaks.
  126. BreakIterator* U_EXPORT2
  127. BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)
  128. {
  129. return createInstance(key, UBRK_WORD, status);
  130. }
  131. // -------------------------------------
  132. // Creates a break iterator for line breaks.
  133. BreakIterator* U_EXPORT2
  134. BreakIterator::createLineInstance(const Locale& key, UErrorCode& status)
  135. {
  136. return createInstance(key, UBRK_LINE, status);
  137. }
  138. // -------------------------------------
  139. // Creates a break iterator for character breaks.
  140. BreakIterator* U_EXPORT2
  141. BreakIterator::createCharacterInstance(const Locale& key, UErrorCode& status)
  142. {
  143. return createInstance(key, UBRK_CHARACTER, status);
  144. }
  145. // -------------------------------------
  146. // Creates a break iterator for sentence breaks.
  147. BreakIterator* U_EXPORT2
  148. BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status)
  149. {
  150. return createInstance(key, UBRK_SENTENCE, status);
  151. }
  152. // -------------------------------------
  153. // Creates a break iterator for title casing breaks.
  154. BreakIterator* U_EXPORT2
  155. BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status)
  156. {
  157. return createInstance(key, UBRK_TITLE, status);
  158. }
  159. // -------------------------------------
  160. // Gets all the available locales that has localized text boundary data.
  161. const Locale* U_EXPORT2
  162. BreakIterator::getAvailableLocales(int32_t& count)
  163. {
  164. return Locale::getAvailableLocales(count);
  165. }
  166. // ------------------------------------------
  167. //
  168. // Constructors, destructor and assignment operator
  169. //
  170. //-------------------------------------------
  171. BreakIterator::BreakIterator()
  172. {
  173. *validLocale = *actualLocale = *requestLocale = 0;
  174. }
  175. BreakIterator::BreakIterator(const BreakIterator &other) : UObject(other) {
  176. uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
  177. uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
  178. uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale));
  179. }
  180. BreakIterator &BreakIterator::operator =(const BreakIterator &other) {
  181. if (this != &other) {
  182. uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
  183. uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
  184. uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale));
  185. }
  186. return *this;
  187. }
  188. BreakIterator::~BreakIterator()
  189. {
  190. }
  191. // ------------------------------------------
  192. //
  193. // Registration
  194. //
  195. //-------------------------------------------
  196. #if !UCONFIG_NO_SERVICE
  197. // -------------------------------------
  198. class ICUBreakIteratorFactory : public ICUResourceBundleFactory {
  199. public:
  200. virtual ~ICUBreakIteratorFactory();
  201. protected:
  202. virtual UObject* handleCreate(const Locale& loc, int32_t kind, const ICUService* /*service*/, UErrorCode& status) const override {
  203. return BreakIterator::makeInstance(loc, kind, status);
  204. }
  205. };
  206. ICUBreakIteratorFactory::~ICUBreakIteratorFactory() {}
  207. // -------------------------------------
  208. class ICUBreakIteratorService : public ICULocaleService {
  209. public:
  210. ICUBreakIteratorService()
  211. : ICULocaleService(UNICODE_STRING("Break Iterator", 14))
  212. {
  213. UErrorCode status = U_ZERO_ERROR;
  214. registerFactory(new ICUBreakIteratorFactory(), status);
  215. }
  216. virtual ~ICUBreakIteratorService();
  217. virtual UObject* cloneInstance(UObject* instance) const override {
  218. return ((BreakIterator*)instance)->clone();
  219. }
  220. virtual UObject* handleDefault(const ICUServiceKey& key, UnicodeString* /*actualID*/, UErrorCode& status) const override {
  221. LocaleKey& lkey = static_cast<LocaleKey&>(const_cast<ICUServiceKey&>(key));
  222. int32_t kind = lkey.kind();
  223. Locale loc;
  224. lkey.currentLocale(loc);
  225. return BreakIterator::makeInstance(loc, kind, status);
  226. }
  227. virtual UBool isDefault() const override {
  228. return countFactories() == 1;
  229. }
  230. };
  231. ICUBreakIteratorService::~ICUBreakIteratorService() {}
  232. // -------------------------------------
  233. // defined in ucln_cmn.h
  234. U_NAMESPACE_END
  235. static icu::UInitOnce gInitOnceBrkiter {};
  236. static icu::ICULocaleService* gService = nullptr;
  237. /**
  238. * Release all static memory held by breakiterator.
  239. */
  240. U_CDECL_BEGIN
  241. static UBool U_CALLCONV breakiterator_cleanup() {
  242. #if !UCONFIG_NO_SERVICE
  243. if (gService) {
  244. delete gService;
  245. gService = nullptr;
  246. }
  247. gInitOnceBrkiter.reset();
  248. #endif
  249. return true;
  250. }
  251. U_CDECL_END
  252. U_NAMESPACE_BEGIN
  253. static void U_CALLCONV
  254. initService() {
  255. gService = new ICUBreakIteratorService();
  256. ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR, breakiterator_cleanup);
  257. }
  258. static ICULocaleService*
  259. getService()
  260. {
  261. umtx_initOnce(gInitOnceBrkiter, &initService);
  262. return gService;
  263. }
  264. // -------------------------------------
  265. static inline UBool
  266. hasService()
  267. {
  268. return !gInitOnceBrkiter.isReset() && getService() != nullptr;
  269. }
  270. // -------------------------------------
  271. URegistryKey U_EXPORT2
  272. BreakIterator::registerInstance(BreakIterator* toAdopt, const Locale& locale, UBreakIteratorType kind, UErrorCode& status)
  273. {
  274. ICULocaleService *service = getService();
  275. if (service == nullptr) {
  276. status = U_MEMORY_ALLOCATION_ERROR;
  277. return nullptr;
  278. }
  279. return service->registerInstance(toAdopt, locale, kind, status);
  280. }
  281. // -------------------------------------
  282. UBool U_EXPORT2
  283. BreakIterator::unregister(URegistryKey key, UErrorCode& status)
  284. {
  285. if (U_SUCCESS(status)) {
  286. if (hasService()) {
  287. return gService->unregister(key, status);
  288. }
  289. status = U_MEMORY_ALLOCATION_ERROR;
  290. }
  291. return false;
  292. }
  293. // -------------------------------------
  294. StringEnumeration* U_EXPORT2
  295. BreakIterator::getAvailableLocales()
  296. {
  297. ICULocaleService *service = getService();
  298. if (service == nullptr) {
  299. return nullptr;
  300. }
  301. return service->getAvailableLocales();
  302. }
  303. #endif /* UCONFIG_NO_SERVICE */
  304. // -------------------------------------
  305. BreakIterator*
  306. BreakIterator::createInstance(const Locale& loc, int32_t kind, UErrorCode& status)
  307. {
  308. if (U_FAILURE(status)) {
  309. return nullptr;
  310. }
  311. #if !UCONFIG_NO_SERVICE
  312. if (hasService()) {
  313. Locale actualLoc("");
  314. BreakIterator *result = (BreakIterator*)gService->get(loc, kind, &actualLoc, status);
  315. // TODO: The way the service code works in ICU 2.8 is that if
  316. // there is a real registered break iterator, the actualLoc
  317. // will be populated, but if the handleDefault path is taken
  318. // (because nothing is registered that can handle the
  319. // requested locale) then the actualLoc comes back empty. In
  320. // that case, the returned object already has its actual/valid
  321. // locale data populated (by makeInstance, which is what
  322. // handleDefault calls), so we don't touch it. YES, A COMMENT
  323. // THIS LONG is a sign of bad code -- so the action item is to
  324. // revisit this in ICU 3.0 and clean it up/fix it/remove it.
  325. if (U_SUCCESS(status) && (result != nullptr) && *actualLoc.getName() != 0) {
  326. U_LOCALE_BASED(locBased, *result);
  327. locBased.setLocaleIDs(actualLoc.getName(), actualLoc.getName());
  328. }
  329. return result;
  330. }
  331. else
  332. #endif
  333. {
  334. return makeInstance(loc, kind, status);
  335. }
  336. }
  337. // -------------------------------------
  338. enum { kKeyValueLenMax = 32 };
  339. BreakIterator*
  340. BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
  341. {
  342. if (U_FAILURE(status)) {
  343. return nullptr;
  344. }
  345. BreakIterator *result = nullptr;
  346. switch (kind) {
  347. case UBRK_CHARACTER:
  348. {
  349. UTRACE_ENTRY(UTRACE_UBRK_CREATE_CHARACTER);
  350. result = BreakIterator::buildInstance(loc, "grapheme", status);
  351. UTRACE_EXIT_STATUS(status);
  352. }
  353. break;
  354. case UBRK_WORD:
  355. {
  356. UTRACE_ENTRY(UTRACE_UBRK_CREATE_WORD);
  357. result = BreakIterator::buildInstance(loc, "word", status);
  358. UTRACE_EXIT_STATUS(status);
  359. }
  360. break;
  361. case UBRK_LINE:
  362. {
  363. char lb_lw[kKeyValueLenMax];
  364. UTRACE_ENTRY(UTRACE_UBRK_CREATE_LINE);
  365. uprv_strcpy(lb_lw, "line");
  366. UErrorCode kvStatus = U_ZERO_ERROR;
  367. auto value = loc.getKeywordValue<CharString>("lb", kvStatus);
  368. if (U_SUCCESS(kvStatus) && (value == "strict" || value == "normal" || value == "loose")) {
  369. uprv_strcat(lb_lw, "_");
  370. uprv_strcat(lb_lw, value.data());
  371. }
  372. // lw=phrase is only supported in Japanese and Korean
  373. if (uprv_strcmp(loc.getLanguage(), "ja") == 0 || uprv_strcmp(loc.getLanguage(), "ko") == 0) {
  374. value = loc.getKeywordValue<CharString>("lw", kvStatus);
  375. if (U_SUCCESS(kvStatus) && value == "phrase") {
  376. uprv_strcat(lb_lw, "_");
  377. uprv_strcat(lb_lw, value.data());
  378. }
  379. }
  380. result = BreakIterator::buildInstance(loc, lb_lw, status);
  381. UTRACE_DATA1(UTRACE_INFO, "lb_lw=%s", lb_lw);
  382. UTRACE_EXIT_STATUS(status);
  383. }
  384. break;
  385. case UBRK_SENTENCE:
  386. {
  387. UTRACE_ENTRY(UTRACE_UBRK_CREATE_SENTENCE);
  388. result = BreakIterator::buildInstance(loc, "sentence", status);
  389. #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
  390. char ssKeyValue[kKeyValueLenMax] = {0};
  391. UErrorCode kvStatus = U_ZERO_ERROR;
  392. int32_t kLen = loc.getKeywordValue("ss", ssKeyValue, kKeyValueLenMax, kvStatus);
  393. if (U_SUCCESS(kvStatus) && kLen > 0 && uprv_strcmp(ssKeyValue,"standard")==0) {
  394. FilteredBreakIteratorBuilder* fbiBuilder = FilteredBreakIteratorBuilder::createInstance(loc, kvStatus);
  395. if (U_SUCCESS(kvStatus)) {
  396. result = fbiBuilder->build(result, status);
  397. delete fbiBuilder;
  398. }
  399. }
  400. #endif
  401. UTRACE_EXIT_STATUS(status);
  402. }
  403. break;
  404. case UBRK_TITLE:
  405. {
  406. UTRACE_ENTRY(UTRACE_UBRK_CREATE_TITLE);
  407. result = BreakIterator::buildInstance(loc, "title", status);
  408. UTRACE_EXIT_STATUS(status);
  409. }
  410. break;
  411. default:
  412. status = U_ILLEGAL_ARGUMENT_ERROR;
  413. }
  414. if (U_FAILURE(status)) {
  415. return nullptr;
  416. }
  417. return result;
  418. }
  419. Locale
  420. BreakIterator::getLocale(ULocDataLocaleType type, UErrorCode& status) const {
  421. if (type == ULOC_REQUESTED_LOCALE) {
  422. return {requestLocale};
  423. }
  424. U_LOCALE_BASED(locBased, *this);
  425. return locBased.getLocale(type, status);
  426. }
  427. const char *
  428. BreakIterator::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const {
  429. if (type == ULOC_REQUESTED_LOCALE) {
  430. return requestLocale;
  431. }
  432. U_LOCALE_BASED(locBased, *this);
  433. return locBased.getLocaleID(type, status);
  434. }
  435. // This implementation of getRuleStatus is a do-nothing stub, here to
  436. // provide a default implementation for any derived BreakIterator classes that
  437. // do not implement it themselves.
  438. int32_t BreakIterator::getRuleStatus() const {
  439. return 0;
  440. }
  441. // This implementation of getRuleStatusVec is a do-nothing stub, here to
  442. // provide a default implementation for any derived BreakIterator classes that
  443. // do not implement it themselves.
  444. int32_t BreakIterator::getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status) {
  445. if (U_FAILURE(status)) {
  446. return 0;
  447. }
  448. if (capacity < 1) {
  449. status = U_BUFFER_OVERFLOW_ERROR;
  450. return 1;
  451. }
  452. *fillInVec = 0;
  453. return 1;
  454. }
  455. BreakIterator::BreakIterator (const Locale& valid, const Locale& actual) {
  456. U_LOCALE_BASED(locBased, (*this));
  457. locBased.setLocaleIDs(valid, actual);
  458. }
  459. U_NAMESPACE_END
  460. #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
  461. //eof