normlzr.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *************************************************************************
  5. * COPYRIGHT:
  6. * Copyright (c) 1996-2012, International Business Machines Corporation and
  7. * others. All Rights Reserved.
  8. *************************************************************************
  9. */
  10. #include "unicode/utypes.h"
  11. #if !UCONFIG_NO_NORMALIZATION
  12. #include "unicode/uniset.h"
  13. #include "unicode/unistr.h"
  14. #include "unicode/chariter.h"
  15. #include "unicode/schriter.h"
  16. #include "unicode/uchriter.h"
  17. #include "unicode/normlzr.h"
  18. #include "unicode/utf16.h"
  19. #include "cmemory.h"
  20. #include "normalizer2impl.h"
  21. #include "uprops.h" // for uniset_getUnicode32Instance()
  22. #if defined(move32)
  23. // System can define move32 intrinsics, but the char iters define move32 method
  24. // using same undef trick in headers, so undef here to re-enable the method.
  25. #undef move32
  26. #endif
  27. U_NAMESPACE_BEGIN
  28. UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
  29. //-------------------------------------------------------------------------
  30. // Constructors and other boilerplate
  31. //-------------------------------------------------------------------------
  32. Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
  33. UObject(), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(mode), fOptions(0),
  34. text(new StringCharacterIterator(str)),
  35. currentIndex(0), nextIndex(0),
  36. buffer(), bufferPos(0)
  37. {
  38. init();
  39. }
  40. Normalizer::Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode) :
  41. UObject(), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(mode), fOptions(0),
  42. text(new UCharCharacterIterator(str, length)),
  43. currentIndex(0), nextIndex(0),
  44. buffer(), bufferPos(0)
  45. {
  46. init();
  47. }
  48. Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
  49. UObject(), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(mode), fOptions(0),
  50. text(iter.clone()),
  51. currentIndex(0), nextIndex(0),
  52. buffer(), bufferPos(0)
  53. {
  54. init();
  55. }
  56. Normalizer::Normalizer(const Normalizer &copy) :
  57. UObject(copy), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(copy.fUMode), fOptions(copy.fOptions),
  58. text(copy.text->clone()),
  59. currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
  60. buffer(copy.buffer), bufferPos(copy.bufferPos)
  61. {
  62. init();
  63. }
  64. void
  65. Normalizer::init() {
  66. UErrorCode errorCode=U_ZERO_ERROR;
  67. fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
  68. if(fOptions&UNORM_UNICODE_3_2) {
  69. delete fFilteredNorm2;
  70. fNorm2=fFilteredNorm2=
  71. new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
  72. }
  73. if(U_FAILURE(errorCode)) {
  74. errorCode=U_ZERO_ERROR;
  75. fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
  76. }
  77. }
  78. Normalizer::~Normalizer()
  79. {
  80. delete fFilteredNorm2;
  81. delete text;
  82. }
  83. Normalizer*
  84. Normalizer::clone() const
  85. {
  86. return new Normalizer(*this);
  87. }
  88. /**
  89. * Generates a hash code for this iterator.
  90. */
  91. int32_t Normalizer::hashCode() const
  92. {
  93. return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
  94. }
  95. bool Normalizer::operator==(const Normalizer& that) const
  96. {
  97. return
  98. this==&that ||
  99. (fUMode==that.fUMode &&
  100. fOptions==that.fOptions &&
  101. *text==*that.text &&
  102. buffer==that.buffer &&
  103. bufferPos==that.bufferPos &&
  104. nextIndex==that.nextIndex);
  105. }
  106. //-------------------------------------------------------------------------
  107. // Static utility methods
  108. //-------------------------------------------------------------------------
  109. void U_EXPORT2
  110. Normalizer::normalize(const UnicodeString& source,
  111. UNormalizationMode mode, int32_t options,
  112. UnicodeString& result,
  113. UErrorCode &status) {
  114. if(source.isBogus() || U_FAILURE(status)) {
  115. result.setToBogus();
  116. if(U_SUCCESS(status)) {
  117. status=U_ILLEGAL_ARGUMENT_ERROR;
  118. }
  119. } else {
  120. UnicodeString localDest;
  121. UnicodeString *dest;
  122. if(&source!=&result) {
  123. dest=&result;
  124. } else {
  125. // the source and result strings are the same object, use a temporary one
  126. dest=&localDest;
  127. }
  128. const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
  129. if(U_SUCCESS(status)) {
  130. if(options&UNORM_UNICODE_3_2) {
  131. FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
  132. normalize(source, *dest, status);
  133. } else {
  134. n2->normalize(source, *dest, status);
  135. }
  136. }
  137. if(dest==&localDest && U_SUCCESS(status)) {
  138. result=*dest;
  139. }
  140. }
  141. }
  142. void U_EXPORT2
  143. Normalizer::compose(const UnicodeString& source,
  144. UBool compat, int32_t options,
  145. UnicodeString& result,
  146. UErrorCode &status) {
  147. normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
  148. }
  149. void U_EXPORT2
  150. Normalizer::decompose(const UnicodeString& source,
  151. UBool compat, int32_t options,
  152. UnicodeString& result,
  153. UErrorCode &status) {
  154. normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
  155. }
  156. UNormalizationCheckResult
  157. Normalizer::quickCheck(const UnicodeString& source,
  158. UNormalizationMode mode, int32_t options,
  159. UErrorCode &status) {
  160. const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
  161. if(U_SUCCESS(status)) {
  162. if(options&UNORM_UNICODE_3_2) {
  163. return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
  164. quickCheck(source, status);
  165. } else {
  166. return n2->quickCheck(source, status);
  167. }
  168. } else {
  169. return UNORM_MAYBE;
  170. }
  171. }
  172. UBool
  173. Normalizer::isNormalized(const UnicodeString& source,
  174. UNormalizationMode mode, int32_t options,
  175. UErrorCode &status) {
  176. const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
  177. if(U_SUCCESS(status)) {
  178. if(options&UNORM_UNICODE_3_2) {
  179. return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
  180. isNormalized(source, status);
  181. } else {
  182. return n2->isNormalized(source, status);
  183. }
  184. } else {
  185. return false;
  186. }
  187. }
  188. UnicodeString & U_EXPORT2
  189. Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
  190. UnicodeString &result,
  191. UNormalizationMode mode, int32_t options,
  192. UErrorCode &errorCode) {
  193. if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
  194. result.setToBogus();
  195. if(U_SUCCESS(errorCode)) {
  196. errorCode=U_ILLEGAL_ARGUMENT_ERROR;
  197. }
  198. } else {
  199. UnicodeString localDest;
  200. UnicodeString *dest;
  201. if(&right!=&result) {
  202. dest=&result;
  203. } else {
  204. // the right and result strings are the same object, use a temporary one
  205. dest=&localDest;
  206. }
  207. *dest=left;
  208. const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
  209. if(U_SUCCESS(errorCode)) {
  210. if(options&UNORM_UNICODE_3_2) {
  211. FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
  212. append(*dest, right, errorCode);
  213. } else {
  214. n2->append(*dest, right, errorCode);
  215. }
  216. }
  217. if(dest==&localDest && U_SUCCESS(errorCode)) {
  218. result=*dest;
  219. }
  220. }
  221. return result;
  222. }
  223. //-------------------------------------------------------------------------
  224. // Iteration API
  225. //-------------------------------------------------------------------------
  226. /**
  227. * Return the current character in the normalized text.
  228. */
  229. UChar32 Normalizer::current() {
  230. if(bufferPos<buffer.length() || nextNormalize()) {
  231. return buffer.char32At(bufferPos);
  232. } else {
  233. return DONE;
  234. }
  235. }
  236. /**
  237. * Return the next character in the normalized text and advance
  238. * the iteration position by one. If the end
  239. * of the text has already been reached, {@link #DONE} is returned.
  240. */
  241. UChar32 Normalizer::next() {
  242. if(bufferPos<buffer.length() || nextNormalize()) {
  243. UChar32 c=buffer.char32At(bufferPos);
  244. bufferPos+=U16_LENGTH(c);
  245. return c;
  246. } else {
  247. return DONE;
  248. }
  249. }
  250. /**
  251. * Return the previous character in the normalized text and decrement
  252. * the iteration position by one. If the beginning
  253. * of the text has already been reached, {@link #DONE} is returned.
  254. */
  255. UChar32 Normalizer::previous() {
  256. if(bufferPos>0 || previousNormalize()) {
  257. UChar32 c=buffer.char32At(bufferPos-1);
  258. bufferPos-=U16_LENGTH(c);
  259. return c;
  260. } else {
  261. return DONE;
  262. }
  263. }
  264. void Normalizer::reset() {
  265. currentIndex=nextIndex=text->setToStart();
  266. clearBuffer();
  267. }
  268. void
  269. Normalizer::setIndexOnly(int32_t index) {
  270. text->setIndex(index); // pins index
  271. currentIndex=nextIndex=text->getIndex();
  272. clearBuffer();
  273. }
  274. /**
  275. * Return the first character in the normalized text. This resets
  276. * the <tt>Normalizer's</tt> position to the beginning of the text.
  277. */
  278. UChar32 Normalizer::first() {
  279. reset();
  280. return next();
  281. }
  282. /**
  283. * Return the last character in the normalized text. This resets
  284. * the <tt>Normalizer's</tt> position to be just before the
  285. * the input text corresponding to that normalized character.
  286. */
  287. UChar32 Normalizer::last() {
  288. currentIndex=nextIndex=text->setToEnd();
  289. clearBuffer();
  290. return previous();
  291. }
  292. /**
  293. * Retrieve the current iteration position in the input text that is
  294. * being normalized. This method is useful in applications such as
  295. * searching, where you need to be able to determine the position in
  296. * the input text that corresponds to a given normalized output character.
  297. * <p>
  298. * <b>Note:</b> This method sets the position in the <em>input</em>, while
  299. * {@link #next} and {@link #previous} iterate through characters in the
  300. * <em>output</em>. This means that there is not necessarily a one-to-one
  301. * correspondence between characters returned by <tt>next</tt> and
  302. * <tt>previous</tt> and the indices passed to and returned from
  303. * <tt>setIndex</tt> and {@link #getIndex}.
  304. *
  305. */
  306. int32_t Normalizer::getIndex() const {
  307. if(bufferPos<buffer.length()) {
  308. return currentIndex;
  309. } else {
  310. return nextIndex;
  311. }
  312. }
  313. /**
  314. * Retrieve the index of the start of the input text. This is the begin index
  315. * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
  316. * over which this <tt>Normalizer</tt> is iterating
  317. */
  318. int32_t Normalizer::startIndex() const {
  319. return text->startIndex();
  320. }
  321. /**
  322. * Retrieve the index of the end of the input text. This is the end index
  323. * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
  324. * over which this <tt>Normalizer</tt> is iterating
  325. */
  326. int32_t Normalizer::endIndex() const {
  327. return text->endIndex();
  328. }
  329. //-------------------------------------------------------------------------
  330. // Property access methods
  331. //-------------------------------------------------------------------------
  332. void
  333. Normalizer::setMode(UNormalizationMode newMode)
  334. {
  335. fUMode = newMode;
  336. init();
  337. }
  338. UNormalizationMode
  339. Normalizer::getUMode() const
  340. {
  341. return fUMode;
  342. }
  343. void
  344. Normalizer::setOption(int32_t option,
  345. UBool value)
  346. {
  347. if (value) {
  348. fOptions |= option;
  349. } else {
  350. fOptions &= (~option);
  351. }
  352. init();
  353. }
  354. UBool
  355. Normalizer::getOption(int32_t option) const
  356. {
  357. return (fOptions & option) != 0;
  358. }
  359. /**
  360. * Set the input text over which this <tt>Normalizer</tt> will iterate.
  361. * The iteration position is set to the beginning of the input text.
  362. */
  363. void
  364. Normalizer::setText(const UnicodeString& newText,
  365. UErrorCode &status)
  366. {
  367. if (U_FAILURE(status)) {
  368. return;
  369. }
  370. CharacterIterator *newIter = new StringCharacterIterator(newText);
  371. if (newIter == nullptr) {
  372. status = U_MEMORY_ALLOCATION_ERROR;
  373. return;
  374. }
  375. delete text;
  376. text = newIter;
  377. reset();
  378. }
  379. /**
  380. * Set the input text over which this <tt>Normalizer</tt> will iterate.
  381. * The iteration position is set to the beginning of the string.
  382. */
  383. void
  384. Normalizer::setText(const CharacterIterator& newText,
  385. UErrorCode &status)
  386. {
  387. if (U_FAILURE(status)) {
  388. return;
  389. }
  390. CharacterIterator *newIter = newText.clone();
  391. if (newIter == nullptr) {
  392. status = U_MEMORY_ALLOCATION_ERROR;
  393. return;
  394. }
  395. delete text;
  396. text = newIter;
  397. reset();
  398. }
  399. void
  400. Normalizer::setText(ConstChar16Ptr newText,
  401. int32_t length,
  402. UErrorCode &status)
  403. {
  404. if (U_FAILURE(status)) {
  405. return;
  406. }
  407. CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
  408. if (newIter == nullptr) {
  409. status = U_MEMORY_ALLOCATION_ERROR;
  410. return;
  411. }
  412. delete text;
  413. text = newIter;
  414. reset();
  415. }
  416. /**
  417. * Copies the text under iteration into the UnicodeString referred to by "result".
  418. * @param result Receives a copy of the text under iteration.
  419. */
  420. void
  421. Normalizer::getText(UnicodeString& result)
  422. {
  423. text->getText(result);
  424. }
  425. //-------------------------------------------------------------------------
  426. // Private utility methods
  427. //-------------------------------------------------------------------------
  428. void Normalizer::clearBuffer() {
  429. buffer.remove();
  430. bufferPos=0;
  431. }
  432. UBool
  433. Normalizer::nextNormalize() {
  434. clearBuffer();
  435. currentIndex=nextIndex;
  436. text->setIndex(nextIndex);
  437. if(!text->hasNext()) {
  438. return false;
  439. }
  440. // Skip at least one character so we make progress.
  441. UnicodeString segment(text->next32PostInc());
  442. while(text->hasNext()) {
  443. UChar32 c;
  444. if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
  445. text->move32(-1, CharacterIterator::kCurrent);
  446. break;
  447. }
  448. segment.append(c);
  449. }
  450. nextIndex=text->getIndex();
  451. UErrorCode errorCode=U_ZERO_ERROR;
  452. fNorm2->normalize(segment, buffer, errorCode);
  453. return U_SUCCESS(errorCode) && !buffer.isEmpty();
  454. }
  455. UBool
  456. Normalizer::previousNormalize() {
  457. clearBuffer();
  458. nextIndex=currentIndex;
  459. text->setIndex(currentIndex);
  460. if(!text->hasPrevious()) {
  461. return false;
  462. }
  463. UnicodeString segment;
  464. while(text->hasPrevious()) {
  465. UChar32 c=text->previous32();
  466. segment.insert(0, c);
  467. if(fNorm2->hasBoundaryBefore(c)) {
  468. break;
  469. }
  470. }
  471. currentIndex=text->getIndex();
  472. UErrorCode errorCode=U_ZERO_ERROR;
  473. fNorm2->normalize(segment, buffer, errorCode);
  474. bufferPos=buffer.length();
  475. return U_SUCCESS(errorCode) && !buffer.isEmpty();
  476. }
  477. U_NAMESPACE_END
  478. #endif /* #if !UCONFIG_NO_NORMALIZATION */