ubrk.cpp 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ********************************************************************************
  5. * Copyright (C) 1996-2015, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. ********************************************************************************
  8. */
  9. #include "unicode/utypes.h"
  10. #if !UCONFIG_NO_BREAK_ITERATION
  11. #include "unicode/ubrk.h"
  12. #include "unicode/brkiter.h"
  13. #include "unicode/uloc.h"
  14. #include "unicode/ustring.h"
  15. #include "unicode/uchriter.h"
  16. #include "unicode/rbbi.h"
  17. #include "rbbirb.h"
  18. #include "uassert.h"
  19. #include "cmemory.h"
  20. U_NAMESPACE_USE
  21. //------------------------------------------------------------------------------
  22. //
  23. // ubrk_open Create a canned type of break iterator based on type (word, line, etc.)
  24. // and locale.
  25. //
  26. //------------------------------------------------------------------------------
  27. U_CAPI UBreakIterator* U_EXPORT2
  28. ubrk_open(UBreakIteratorType type,
  29. const char *locale,
  30. const char16_t *text,
  31. int32_t textLength,
  32. UErrorCode *status)
  33. {
  34. if(U_FAILURE(*status)) return 0;
  35. BreakIterator *result = 0;
  36. switch(type) {
  37. case UBRK_CHARACTER:
  38. result = BreakIterator::createCharacterInstance(Locale(locale), *status);
  39. break;
  40. case UBRK_WORD:
  41. result = BreakIterator::createWordInstance(Locale(locale), *status);
  42. break;
  43. case UBRK_LINE:
  44. result = BreakIterator::createLineInstance(Locale(locale), *status);
  45. break;
  46. case UBRK_SENTENCE:
  47. result = BreakIterator::createSentenceInstance(Locale(locale), *status);
  48. break;
  49. case UBRK_TITLE:
  50. result = BreakIterator::createTitleInstance(Locale(locale), *status);
  51. break;
  52. default:
  53. *status = U_ILLEGAL_ARGUMENT_ERROR;
  54. }
  55. // check for allocation error
  56. if (U_FAILURE(*status)) {
  57. return 0;
  58. }
  59. if(result == 0) {
  60. *status = U_MEMORY_ALLOCATION_ERROR;
  61. return 0;
  62. }
  63. UBreakIterator *uBI = (UBreakIterator *)result;
  64. if (text != nullptr) {
  65. ubrk_setText(uBI, text, textLength, status);
  66. }
  67. return uBI;
  68. }
  69. //------------------------------------------------------------------------------
  70. //
  71. // ubrk_openRules open a break iterator from a set of break rules.
  72. // Invokes the rule builder.
  73. //
  74. //------------------------------------------------------------------------------
  75. U_CAPI UBreakIterator* U_EXPORT2
  76. ubrk_openRules( const char16_t *rules,
  77. int32_t rulesLength,
  78. const char16_t *text,
  79. int32_t textLength,
  80. UParseError *parseErr,
  81. UErrorCode *status) {
  82. if (status == nullptr || U_FAILURE(*status)){
  83. return 0;
  84. }
  85. BreakIterator *result = 0;
  86. UnicodeString ruleString(rules, rulesLength);
  87. result = RBBIRuleBuilder::createRuleBasedBreakIterator(ruleString, parseErr, *status);
  88. if(U_FAILURE(*status)) {
  89. return 0;
  90. }
  91. UBreakIterator *uBI = (UBreakIterator *)result;
  92. if (text != nullptr) {
  93. ubrk_setText(uBI, text, textLength, status);
  94. }
  95. return uBI;
  96. }
  97. U_CAPI UBreakIterator* U_EXPORT2
  98. ubrk_openBinaryRules(const uint8_t *binaryRules, int32_t rulesLength,
  99. const char16_t * text, int32_t textLength,
  100. UErrorCode * status)
  101. {
  102. if (U_FAILURE(*status)) {
  103. return nullptr;
  104. }
  105. if (rulesLength < 0) {
  106. *status = U_ILLEGAL_ARGUMENT_ERROR;
  107. return nullptr;
  108. }
  109. LocalPointer<RuleBasedBreakIterator> lpRBBI(new RuleBasedBreakIterator(binaryRules, rulesLength, *status), *status);
  110. if (U_FAILURE(*status)) {
  111. return nullptr;
  112. }
  113. UBreakIterator *uBI = reinterpret_cast<UBreakIterator *>(lpRBBI.orphan());
  114. if (text != nullptr) {
  115. ubrk_setText(uBI, text, textLength, status);
  116. }
  117. return uBI;
  118. }
  119. U_CAPI UBreakIterator * U_EXPORT2
  120. ubrk_safeClone(
  121. const UBreakIterator *bi,
  122. void * /*stackBuffer*/,
  123. int32_t *pBufferSize,
  124. UErrorCode *status)
  125. {
  126. if (status == nullptr || U_FAILURE(*status)){
  127. return nullptr;
  128. }
  129. if (bi == nullptr) {
  130. *status = U_ILLEGAL_ARGUMENT_ERROR;
  131. return nullptr;
  132. }
  133. if (pBufferSize != nullptr) {
  134. int32_t inputSize = *pBufferSize;
  135. *pBufferSize = 1;
  136. if (inputSize == 0) {
  137. return nullptr; // preflighting for deprecated functionality
  138. }
  139. }
  140. BreakIterator *newBI = ((BreakIterator *)bi)->clone();
  141. if (newBI == nullptr) {
  142. *status = U_MEMORY_ALLOCATION_ERROR;
  143. } else if (pBufferSize != nullptr) {
  144. *status = U_SAFECLONE_ALLOCATED_WARNING;
  145. }
  146. return (UBreakIterator *)newBI;
  147. }
  148. U_CAPI UBreakIterator * U_EXPORT2
  149. ubrk_clone(const UBreakIterator *bi, UErrorCode *status) {
  150. return ubrk_safeClone(bi, nullptr, nullptr, status);
  151. }
  152. U_CAPI void U_EXPORT2
  153. ubrk_close(UBreakIterator *bi)
  154. {
  155. delete (BreakIterator *)bi;
  156. }
  157. U_CAPI void U_EXPORT2
  158. ubrk_setText(UBreakIterator* bi,
  159. const char16_t* text,
  160. int32_t textLength,
  161. UErrorCode* status)
  162. {
  163. UText ut = UTEXT_INITIALIZER;
  164. utext_openUChars(&ut, text, textLength, status);
  165. ((BreakIterator*)bi)->setText(&ut, *status);
  166. // A stack allocated UText wrapping a char16_t * string
  167. // can be dumped without explicitly closing it.
  168. }
  169. U_CAPI void U_EXPORT2
  170. ubrk_setUText(UBreakIterator *bi,
  171. UText *text,
  172. UErrorCode *status)
  173. {
  174. ((BreakIterator*)bi)->setText(text, *status);
  175. }
  176. U_CAPI int32_t U_EXPORT2
  177. ubrk_current(const UBreakIterator *bi)
  178. {
  179. return ((BreakIterator*)bi)->current();
  180. }
  181. U_CAPI int32_t U_EXPORT2
  182. ubrk_next(UBreakIterator *bi)
  183. {
  184. return ((BreakIterator*)bi)->next();
  185. }
  186. U_CAPI int32_t U_EXPORT2
  187. ubrk_previous(UBreakIterator *bi)
  188. {
  189. return ((BreakIterator*)bi)->previous();
  190. }
  191. U_CAPI int32_t U_EXPORT2
  192. ubrk_first(UBreakIterator *bi)
  193. {
  194. return ((BreakIterator*)bi)->first();
  195. }
  196. U_CAPI int32_t U_EXPORT2
  197. ubrk_last(UBreakIterator *bi)
  198. {
  199. return ((BreakIterator*)bi)->last();
  200. }
  201. U_CAPI int32_t U_EXPORT2
  202. ubrk_preceding(UBreakIterator *bi,
  203. int32_t offset)
  204. {
  205. return ((BreakIterator*)bi)->preceding(offset);
  206. }
  207. U_CAPI int32_t U_EXPORT2
  208. ubrk_following(UBreakIterator *bi,
  209. int32_t offset)
  210. {
  211. return ((BreakIterator*)bi)->following(offset);
  212. }
  213. U_CAPI const char* U_EXPORT2
  214. ubrk_getAvailable(int32_t index)
  215. {
  216. return uloc_getAvailable(index);
  217. }
  218. U_CAPI int32_t U_EXPORT2
  219. ubrk_countAvailable()
  220. {
  221. return uloc_countAvailable();
  222. }
  223. U_CAPI UBool U_EXPORT2
  224. ubrk_isBoundary(UBreakIterator *bi, int32_t offset)
  225. {
  226. return ((BreakIterator*)bi)->isBoundary(offset);
  227. }
  228. U_CAPI int32_t U_EXPORT2
  229. ubrk_getRuleStatus(UBreakIterator *bi)
  230. {
  231. return ((BreakIterator*)bi)->getRuleStatus();
  232. }
  233. U_CAPI int32_t U_EXPORT2
  234. ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status)
  235. {
  236. return ((BreakIterator*)bi)->getRuleStatusVec(fillInVec, capacity, *status);
  237. }
  238. U_CAPI const char* U_EXPORT2
  239. ubrk_getLocaleByType(const UBreakIterator *bi,
  240. ULocDataLocaleType type,
  241. UErrorCode* status)
  242. {
  243. if (bi == nullptr) {
  244. if (U_SUCCESS(*status)) {
  245. *status = U_ILLEGAL_ARGUMENT_ERROR;
  246. }
  247. return nullptr;
  248. }
  249. return ((BreakIterator*)bi)->getLocaleID(type, *status);
  250. }
  251. U_CAPI void U_EXPORT2
  252. ubrk_refreshUText(UBreakIterator *bi,
  253. UText *text,
  254. UErrorCode *status)
  255. {
  256. BreakIterator *bii = reinterpret_cast<BreakIterator *>(bi);
  257. bii->refreshInputText(text, *status);
  258. }
  259. U_CAPI int32_t U_EXPORT2
  260. ubrk_getBinaryRules(UBreakIterator *bi,
  261. uint8_t * binaryRules, int32_t rulesCapacity,
  262. UErrorCode * status)
  263. {
  264. if (U_FAILURE(*status)) {
  265. return 0;
  266. }
  267. if ((binaryRules == nullptr && rulesCapacity > 0) || rulesCapacity < 0) {
  268. *status = U_ILLEGAL_ARGUMENT_ERROR;
  269. return 0;
  270. }
  271. RuleBasedBreakIterator* rbbi;
  272. if ((rbbi = dynamic_cast<RuleBasedBreakIterator*>(reinterpret_cast<BreakIterator*>(bi))) == nullptr) {
  273. *status = U_ILLEGAL_ARGUMENT_ERROR;
  274. return 0;
  275. }
  276. uint32_t rulesLength;
  277. const uint8_t * returnedRules = rbbi->getBinaryRules(rulesLength);
  278. if (rulesLength > INT32_MAX) {
  279. *status = U_INDEX_OUTOFBOUNDS_ERROR;
  280. return 0;
  281. }
  282. if (binaryRules != nullptr) { // if not preflighting
  283. // Here we know rulesLength <= INT32_MAX and rulesCapacity >= 0, can cast safely
  284. if ((int32_t)rulesLength > rulesCapacity) {
  285. *status = U_BUFFER_OVERFLOW_ERROR;
  286. } else {
  287. uprv_memcpy(binaryRules, returnedRules, rulesLength);
  288. }
  289. }
  290. return (int32_t)rulesLength;
  291. }
  292. #endif /* #if !UCONFIG_NO_BREAK_ITERATION */