edits.cpp 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804
  1. // © 2017 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. // edits.cpp
  4. // created: 2017feb08 Markus W. Scherer
  5. #include "unicode/edits.h"
  6. #include "unicode/unistr.h"
  7. #include "unicode/utypes.h"
  8. #include "cmemory.h"
  9. #include "uassert.h"
  10. #include "util.h"
  11. U_NAMESPACE_BEGIN
  12. namespace {
  13. // 0000uuuuuuuuuuuu records u+1 unchanged text units.
  14. const int32_t MAX_UNCHANGED_LENGTH = 0x1000;
  15. const int32_t MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1;
  16. // 0mmmnnnccccccccc with m=1..6 records ccc+1 replacements of m:n text units.
  17. const int32_t MAX_SHORT_CHANGE_OLD_LENGTH = 6;
  18. const int32_t MAX_SHORT_CHANGE_NEW_LENGTH = 7;
  19. const int32_t SHORT_CHANGE_NUM_MASK = 0x1ff;
  20. const int32_t MAX_SHORT_CHANGE = 0x6fff;
  21. // 0111mmmmmmnnnnnn records a replacement of m text units with n.
  22. // m or n = 61: actual length follows in the next edits array unit.
  23. // m or n = 62..63: actual length follows in the next two edits array units.
  24. // Bit 30 of the actual length is in the head unit.
  25. // Trailing units have bit 15 set.
  26. const int32_t LENGTH_IN_1TRAIL = 61;
  27. const int32_t LENGTH_IN_2TRAIL = 62;
  28. } // namespace
  29. void Edits::releaseArray() noexcept {
  30. if (array != stackArray) {
  31. uprv_free(array);
  32. }
  33. }
  34. Edits &Edits::copyArray(const Edits &other) {
  35. if (U_FAILURE(errorCode_)) {
  36. length = delta = numChanges = 0;
  37. return *this;
  38. }
  39. if (length > capacity) {
  40. uint16_t* newArray = static_cast<uint16_t*>(uprv_malloc(static_cast<size_t>(length) * 2));
  41. if (newArray == nullptr) {
  42. length = delta = numChanges = 0;
  43. errorCode_ = U_MEMORY_ALLOCATION_ERROR;
  44. return *this;
  45. }
  46. releaseArray();
  47. array = newArray;
  48. capacity = length;
  49. }
  50. if (length > 0) {
  51. uprv_memcpy(array, other.array, (size_t)length * 2);
  52. }
  53. return *this;
  54. }
  55. Edits &Edits::moveArray(Edits &src) noexcept {
  56. if (U_FAILURE(errorCode_)) {
  57. length = delta = numChanges = 0;
  58. return *this;
  59. }
  60. releaseArray();
  61. if (length > STACK_CAPACITY) {
  62. array = src.array;
  63. capacity = src.capacity;
  64. src.array = src.stackArray;
  65. src.capacity = STACK_CAPACITY;
  66. src.reset();
  67. return *this;
  68. }
  69. array = stackArray;
  70. capacity = STACK_CAPACITY;
  71. if (length > 0) {
  72. uprv_memcpy(array, src.array, (size_t)length * 2);
  73. }
  74. return *this;
  75. }
  76. Edits &Edits::operator=(const Edits &other) {
  77. if (this == &other) { return *this; } // self-assignment: no-op
  78. length = other.length;
  79. delta = other.delta;
  80. numChanges = other.numChanges;
  81. errorCode_ = other.errorCode_;
  82. return copyArray(other);
  83. }
  84. Edits &Edits::operator=(Edits &&src) noexcept {
  85. length = src.length;
  86. delta = src.delta;
  87. numChanges = src.numChanges;
  88. errorCode_ = src.errorCode_;
  89. return moveArray(src);
  90. }
  91. Edits::~Edits() {
  92. releaseArray();
  93. }
  94. void Edits::reset() noexcept {
  95. length = delta = numChanges = 0;
  96. errorCode_ = U_ZERO_ERROR;
  97. }
  98. void Edits::addUnchanged(int32_t unchangedLength) {
  99. if(U_FAILURE(errorCode_) || unchangedLength == 0) { return; }
  100. if(unchangedLength < 0) {
  101. errorCode_ = U_ILLEGAL_ARGUMENT_ERROR;
  102. return;
  103. }
  104. // Merge into previous unchanged-text record, if any.
  105. int32_t last = lastUnit();
  106. if(last < MAX_UNCHANGED) {
  107. int32_t remaining = MAX_UNCHANGED - last;
  108. if (remaining >= unchangedLength) {
  109. setLastUnit(last + unchangedLength);
  110. return;
  111. }
  112. setLastUnit(MAX_UNCHANGED);
  113. unchangedLength -= remaining;
  114. }
  115. // Split large lengths into multiple units.
  116. while(unchangedLength >= MAX_UNCHANGED_LENGTH) {
  117. append(MAX_UNCHANGED);
  118. unchangedLength -= MAX_UNCHANGED_LENGTH;
  119. }
  120. // Write a small (remaining) length.
  121. if(unchangedLength > 0) {
  122. append(unchangedLength - 1);
  123. }
  124. }
  125. void Edits::addReplace(int32_t oldLength, int32_t newLength) {
  126. if(U_FAILURE(errorCode_)) { return; }
  127. if(oldLength < 0 || newLength < 0) {
  128. errorCode_ = U_ILLEGAL_ARGUMENT_ERROR;
  129. return;
  130. }
  131. if (oldLength == 0 && newLength == 0) {
  132. return;
  133. }
  134. ++numChanges;
  135. int32_t newDelta = newLength - oldLength;
  136. if (newDelta != 0) {
  137. if ((newDelta > 0 && delta >= 0 && newDelta > (INT32_MAX - delta)) ||
  138. (newDelta < 0 && delta < 0 && newDelta < (INT32_MIN - delta))) {
  139. // Integer overflow or underflow.
  140. errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR;
  141. return;
  142. }
  143. delta += newDelta;
  144. }
  145. if(0 < oldLength && oldLength <= MAX_SHORT_CHANGE_OLD_LENGTH &&
  146. newLength <= MAX_SHORT_CHANGE_NEW_LENGTH) {
  147. // Merge into previous same-lengths short-replacement record, if any.
  148. int32_t u = (oldLength << 12) | (newLength << 9);
  149. int32_t last = lastUnit();
  150. if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE &&
  151. (last & ~SHORT_CHANGE_NUM_MASK) == u &&
  152. (last & SHORT_CHANGE_NUM_MASK) < SHORT_CHANGE_NUM_MASK) {
  153. setLastUnit(last + 1);
  154. return;
  155. }
  156. append(u);
  157. return;
  158. }
  159. int32_t head = 0x7000;
  160. if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) {
  161. head |= oldLength << 6;
  162. head |= newLength;
  163. append(head);
  164. } else if ((capacity - length) >= 5 || growArray()) {
  165. int32_t limit = length + 1;
  166. if(oldLength < LENGTH_IN_1TRAIL) {
  167. head |= oldLength << 6;
  168. } else if(oldLength <= 0x7fff) {
  169. head |= LENGTH_IN_1TRAIL << 6;
  170. array[limit++] = static_cast<uint16_t>(0x8000 | oldLength);
  171. } else {
  172. head |= (LENGTH_IN_2TRAIL + (oldLength >> 30)) << 6;
  173. array[limit++] = static_cast<uint16_t>(0x8000 | (oldLength >> 15));
  174. array[limit++] = static_cast<uint16_t>(0x8000 | oldLength);
  175. }
  176. if(newLength < LENGTH_IN_1TRAIL) {
  177. head |= newLength;
  178. } else if(newLength <= 0x7fff) {
  179. head |= LENGTH_IN_1TRAIL;
  180. array[limit++] = static_cast<uint16_t>(0x8000 | newLength);
  181. } else {
  182. head |= LENGTH_IN_2TRAIL + (newLength >> 30);
  183. array[limit++] = static_cast<uint16_t>(0x8000 | (newLength >> 15));
  184. array[limit++] = static_cast<uint16_t>(0x8000 | newLength);
  185. }
  186. array[length] = static_cast<uint16_t>(head);
  187. length = limit;
  188. }
  189. }
  190. void Edits::append(int32_t r) {
  191. if(length < capacity || growArray()) {
  192. array[length++] = static_cast<uint16_t>(r);
  193. }
  194. }
  195. UBool Edits::growArray() {
  196. int32_t newCapacity;
  197. if (array == stackArray) {
  198. newCapacity = 2000;
  199. } else if (capacity == INT32_MAX) {
  200. // Not U_BUFFER_OVERFLOW_ERROR because that could be confused on a string transform API
  201. // with a result-string-buffer overflow.
  202. errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR;
  203. return false;
  204. } else if (capacity >= (INT32_MAX / 2)) {
  205. newCapacity = INT32_MAX;
  206. } else {
  207. newCapacity = 2 * capacity;
  208. }
  209. // Grow by at least 5 units so that a maximal change record will fit.
  210. if ((newCapacity - capacity) < 5) {
  211. errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR;
  212. return false;
  213. }
  214. uint16_t* newArray = static_cast<uint16_t*>(uprv_malloc(static_cast<size_t>(newCapacity) * 2));
  215. if (newArray == nullptr) {
  216. errorCode_ = U_MEMORY_ALLOCATION_ERROR;
  217. return false;
  218. }
  219. uprv_memcpy(newArray, array, (size_t)length * 2);
  220. releaseArray();
  221. array = newArray;
  222. capacity = newCapacity;
  223. return true;
  224. }
  225. UBool Edits::copyErrorTo(UErrorCode &outErrorCode) const {
  226. if (U_FAILURE(outErrorCode)) { return true; }
  227. if (U_SUCCESS(errorCode_)) { return false; }
  228. outErrorCode = errorCode_;
  229. return true;
  230. }
  231. Edits &Edits::mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode) {
  232. if (copyErrorTo(errorCode)) { return *this; }
  233. // Picture string a --(Edits ab)--> string b --(Edits bc)--> string c.
  234. // Parallel iteration over both Edits.
  235. Iterator abIter = ab.getFineIterator();
  236. Iterator bcIter = bc.getFineIterator();
  237. UBool abHasNext = true, bcHasNext = true;
  238. // Copy iterator state into local variables, so that we can modify and subdivide spans.
  239. // ab old & new length, bc old & new length
  240. int32_t aLength = 0, ab_bLength = 0, bc_bLength = 0, cLength = 0;
  241. // When we have different-intermediate-length changes, we accumulate a larger change.
  242. int32_t pending_aLength = 0, pending_cLength = 0;
  243. for (;;) {
  244. // At this point, for each of the two iterators:
  245. // Either we are done with the locally cached current edit,
  246. // and its intermediate-string length has been reset,
  247. // or we will continue to work with a truncated remainder of this edit.
  248. //
  249. // If the current edit is done, and the iterator has not yet reached the end,
  250. // then we fetch the next edit. This is true for at least one of the iterators.
  251. //
  252. // Normally it does not matter whether we fetch from ab and then bc or vice versa.
  253. // However, the result is observably different when
  254. // ab deletions meet bc insertions at the same intermediate-string index.
  255. // Some users expect the bc insertions to come first, so we fetch from bc first.
  256. if (bc_bLength == 0) {
  257. if (bcHasNext && (bcHasNext = bcIter.next(errorCode)) != 0) {
  258. bc_bLength = bcIter.oldLength();
  259. cLength = bcIter.newLength();
  260. if (bc_bLength == 0) {
  261. // insertion
  262. if (ab_bLength == 0 || !abIter.hasChange()) {
  263. addReplace(pending_aLength, pending_cLength + cLength);
  264. pending_aLength = pending_cLength = 0;
  265. } else {
  266. pending_cLength += cLength;
  267. }
  268. continue;
  269. }
  270. }
  271. // else see if the other iterator is done, too.
  272. }
  273. if (ab_bLength == 0) {
  274. if (abHasNext && (abHasNext = abIter.next(errorCode)) != 0) {
  275. aLength = abIter.oldLength();
  276. ab_bLength = abIter.newLength();
  277. if (ab_bLength == 0) {
  278. // deletion
  279. if (bc_bLength == bcIter.oldLength() || !bcIter.hasChange()) {
  280. addReplace(pending_aLength + aLength, pending_cLength);
  281. pending_aLength = pending_cLength = 0;
  282. } else {
  283. pending_aLength += aLength;
  284. }
  285. continue;
  286. }
  287. } else if (bc_bLength == 0) {
  288. // Both iterators are done at the same time:
  289. // The intermediate-string lengths match.
  290. break;
  291. } else {
  292. // The ab output string is shorter than the bc input string.
  293. if (!copyErrorTo(errorCode)) {
  294. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  295. }
  296. return *this;
  297. }
  298. }
  299. if (bc_bLength == 0) {
  300. // The bc input string is shorter than the ab output string.
  301. if (!copyErrorTo(errorCode)) {
  302. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  303. }
  304. return *this;
  305. }
  306. // Done fetching: ab_bLength > 0 && bc_bLength > 0
  307. // The current state has two parts:
  308. // - Past: We accumulate a longer ac edit in the "pending" variables.
  309. // - Current: We have copies of the current ab/bc edits in local variables.
  310. // At least one side is newly fetched.
  311. // One side might be a truncated remainder of an edit we fetched earlier.
  312. if (!abIter.hasChange() && !bcIter.hasChange()) {
  313. // An unchanged span all the way from string a to string c.
  314. if (pending_aLength != 0 || pending_cLength != 0) {
  315. addReplace(pending_aLength, pending_cLength);
  316. pending_aLength = pending_cLength = 0;
  317. }
  318. int32_t unchangedLength = aLength <= cLength ? aLength : cLength;
  319. addUnchanged(unchangedLength);
  320. ab_bLength = aLength -= unchangedLength;
  321. bc_bLength = cLength -= unchangedLength;
  322. // At least one of the unchanged spans is now empty.
  323. continue;
  324. }
  325. if (!abIter.hasChange() && bcIter.hasChange()) {
  326. // Unchanged a->b but changed b->c.
  327. if (ab_bLength >= bc_bLength) {
  328. // Split the longer unchanged span into change + remainder.
  329. addReplace(pending_aLength + bc_bLength, pending_cLength + cLength);
  330. pending_aLength = pending_cLength = 0;
  331. aLength = ab_bLength -= bc_bLength;
  332. bc_bLength = 0;
  333. continue;
  334. }
  335. // Handle the shorter unchanged span below like a change.
  336. } else if (abIter.hasChange() && !bcIter.hasChange()) {
  337. // Changed a->b and then unchanged b->c.
  338. if (ab_bLength <= bc_bLength) {
  339. // Split the longer unchanged span into change + remainder.
  340. addReplace(pending_aLength + aLength, pending_cLength + ab_bLength);
  341. pending_aLength = pending_cLength = 0;
  342. cLength = bc_bLength -= ab_bLength;
  343. ab_bLength = 0;
  344. continue;
  345. }
  346. // Handle the shorter unchanged span below like a change.
  347. } else { // both abIter.hasChange() && bcIter.hasChange()
  348. if (ab_bLength == bc_bLength) {
  349. // Changes on both sides up to the same position. Emit & reset.
  350. addReplace(pending_aLength + aLength, pending_cLength + cLength);
  351. pending_aLength = pending_cLength = 0;
  352. ab_bLength = bc_bLength = 0;
  353. continue;
  354. }
  355. }
  356. // Accumulate the a->c change, reset the shorter side,
  357. // keep a remainder of the longer one.
  358. pending_aLength += aLength;
  359. pending_cLength += cLength;
  360. if (ab_bLength < bc_bLength) {
  361. bc_bLength -= ab_bLength;
  362. cLength = ab_bLength = 0;
  363. } else { // ab_bLength > bc_bLength
  364. ab_bLength -= bc_bLength;
  365. aLength = bc_bLength = 0;
  366. }
  367. }
  368. if (pending_aLength != 0 || pending_cLength != 0) {
  369. addReplace(pending_aLength, pending_cLength);
  370. }
  371. copyErrorTo(errorCode);
  372. return *this;
  373. }
  374. Edits::Iterator::Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs) :
  375. array(a), index(0), length(len), remaining(0),
  376. onlyChanges_(oc), coarse(crs),
  377. dir(0), changed(false), oldLength_(0), newLength_(0),
  378. srcIndex(0), replIndex(0), destIndex(0) {}
  379. int32_t Edits::Iterator::readLength(int32_t head) {
  380. if (head < LENGTH_IN_1TRAIL) {
  381. return head;
  382. } else if (head < LENGTH_IN_2TRAIL) {
  383. U_ASSERT(index < length);
  384. U_ASSERT(array[index] >= 0x8000);
  385. return array[index++] & 0x7fff;
  386. } else {
  387. U_ASSERT((index + 2) <= length);
  388. U_ASSERT(array[index] >= 0x8000);
  389. U_ASSERT(array[index + 1] >= 0x8000);
  390. int32_t len = ((head & 1) << 30) |
  391. (static_cast<int32_t>(array[index] & 0x7fff) << 15) |
  392. (array[index + 1] & 0x7fff);
  393. index += 2;
  394. return len;
  395. }
  396. }
  397. void Edits::Iterator::updateNextIndexes() {
  398. srcIndex += oldLength_;
  399. if (changed) {
  400. replIndex += newLength_;
  401. }
  402. destIndex += newLength_;
  403. }
  404. void Edits::Iterator::updatePreviousIndexes() {
  405. srcIndex -= oldLength_;
  406. if (changed) {
  407. replIndex -= newLength_;
  408. }
  409. destIndex -= newLength_;
  410. }
  411. UBool Edits::Iterator::noNext() {
  412. // No change before or beyond the string.
  413. dir = 0;
  414. changed = false;
  415. oldLength_ = newLength_ = 0;
  416. return false;
  417. }
  418. UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) {
  419. // Forward iteration: Update the string indexes to the limit of the current span,
  420. // and post-increment-read array units to assemble a new span.
  421. // Leaves the array index one after the last unit of that span.
  422. if (U_FAILURE(errorCode)) { return false; }
  423. // We have an errorCode in case we need to start guarding against integer overflows.
  424. // It is also convenient for caller loops if we bail out when an error was set elsewhere.
  425. if (dir > 0) {
  426. updateNextIndexes();
  427. } else {
  428. if (dir < 0) {
  429. // Turn around from previous() to next().
  430. // Post-increment-read the same span again.
  431. if (remaining > 0) {
  432. // Fine-grained iterator:
  433. // Stay on the current one of a sequence of compressed changes.
  434. ++index; // next() rests on the index after the sequence unit.
  435. dir = 1;
  436. return true;
  437. }
  438. }
  439. dir = 1;
  440. }
  441. if (remaining >= 1) {
  442. // Fine-grained iterator: Continue a sequence of compressed changes.
  443. if (remaining > 1) {
  444. --remaining;
  445. return true;
  446. }
  447. remaining = 0;
  448. }
  449. if (index >= length) {
  450. return noNext();
  451. }
  452. int32_t u = array[index++];
  453. if (u <= MAX_UNCHANGED) {
  454. // Combine adjacent unchanged ranges.
  455. changed = false;
  456. oldLength_ = u + 1;
  457. while (index < length && (u = array[index]) <= MAX_UNCHANGED) {
  458. ++index;
  459. oldLength_ += u + 1;
  460. }
  461. newLength_ = oldLength_;
  462. if (onlyChanges) {
  463. updateNextIndexes();
  464. if (index >= length) {
  465. return noNext();
  466. }
  467. // already fetched u > MAX_UNCHANGED at index
  468. ++index;
  469. } else {
  470. return true;
  471. }
  472. }
  473. changed = true;
  474. if (u <= MAX_SHORT_CHANGE) {
  475. int32_t oldLen = u >> 12;
  476. int32_t newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH;
  477. int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
  478. if (coarse) {
  479. oldLength_ = num * oldLen;
  480. newLength_ = num * newLen;
  481. } else {
  482. // Split a sequence of changes that was compressed into one unit.
  483. oldLength_ = oldLen;
  484. newLength_ = newLen;
  485. if (num > 1) {
  486. remaining = num; // This is the first of two or more changes.
  487. }
  488. return true;
  489. }
  490. } else {
  491. U_ASSERT(u <= 0x7fff);
  492. oldLength_ = readLength((u >> 6) & 0x3f);
  493. newLength_ = readLength(u & 0x3f);
  494. if (!coarse) {
  495. return true;
  496. }
  497. }
  498. // Combine adjacent changes.
  499. while (index < length && (u = array[index]) > MAX_UNCHANGED) {
  500. ++index;
  501. if (u <= MAX_SHORT_CHANGE) {
  502. int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
  503. oldLength_ += (u >> 12) * num;
  504. newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num;
  505. } else {
  506. U_ASSERT(u <= 0x7fff);
  507. oldLength_ += readLength((u >> 6) & 0x3f);
  508. newLength_ += readLength(u & 0x3f);
  509. }
  510. }
  511. return true;
  512. }
  513. UBool Edits::Iterator::previous(UErrorCode &errorCode) {
  514. // Backward iteration: Pre-decrement-read array units to assemble a new span,
  515. // then update the string indexes to the start of that span.
  516. // Leaves the array index on the head unit of that span.
  517. if (U_FAILURE(errorCode)) { return false; }
  518. // We have an errorCode in case we need to start guarding against integer overflows.
  519. // It is also convenient for caller loops if we bail out when an error was set elsewhere.
  520. if (dir >= 0) {
  521. if (dir > 0) {
  522. // Turn around from next() to previous().
  523. // Set the string indexes to the span limit and
  524. // pre-decrement-read the same span again.
  525. if (remaining > 0) {
  526. // Fine-grained iterator:
  527. // Stay on the current one of a sequence of compressed changes.
  528. --index; // previous() rests on the sequence unit.
  529. dir = -1;
  530. return true;
  531. }
  532. updateNextIndexes();
  533. }
  534. dir = -1;
  535. }
  536. if (remaining > 0) {
  537. // Fine-grained iterator: Continue a sequence of compressed changes.
  538. int32_t u = array[index];
  539. U_ASSERT(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE);
  540. if (remaining <= (u & SHORT_CHANGE_NUM_MASK)) {
  541. ++remaining;
  542. updatePreviousIndexes();
  543. return true;
  544. }
  545. remaining = 0;
  546. }
  547. if (index <= 0) {
  548. return noNext();
  549. }
  550. int32_t u = array[--index];
  551. if (u <= MAX_UNCHANGED) {
  552. // Combine adjacent unchanged ranges.
  553. changed = false;
  554. oldLength_ = u + 1;
  555. while (index > 0 && (u = array[index - 1]) <= MAX_UNCHANGED) {
  556. --index;
  557. oldLength_ += u + 1;
  558. }
  559. newLength_ = oldLength_;
  560. // No need to handle onlyChanges as long as previous() is called only from findIndex().
  561. updatePreviousIndexes();
  562. return true;
  563. }
  564. changed = true;
  565. if (u <= MAX_SHORT_CHANGE) {
  566. int32_t oldLen = u >> 12;
  567. int32_t newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH;
  568. int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
  569. if (coarse) {
  570. oldLength_ = num * oldLen;
  571. newLength_ = num * newLen;
  572. } else {
  573. // Split a sequence of changes that was compressed into one unit.
  574. oldLength_ = oldLen;
  575. newLength_ = newLen;
  576. if (num > 1) {
  577. remaining = 1; // This is the last of two or more changes.
  578. }
  579. updatePreviousIndexes();
  580. return true;
  581. }
  582. } else {
  583. if (u <= 0x7fff) {
  584. // The change is encoded in u alone.
  585. oldLength_ = readLength((u >> 6) & 0x3f);
  586. newLength_ = readLength(u & 0x3f);
  587. } else {
  588. // Back up to the head of the change, read the lengths,
  589. // and reset the index to the head again.
  590. U_ASSERT(index > 0);
  591. while ((u = array[--index]) > 0x7fff) {}
  592. U_ASSERT(u > MAX_SHORT_CHANGE);
  593. int32_t headIndex = index++;
  594. oldLength_ = readLength((u >> 6) & 0x3f);
  595. newLength_ = readLength(u & 0x3f);
  596. index = headIndex;
  597. }
  598. if (!coarse) {
  599. updatePreviousIndexes();
  600. return true;
  601. }
  602. }
  603. // Combine adjacent changes.
  604. while (index > 0 && (u = array[index - 1]) > MAX_UNCHANGED) {
  605. --index;
  606. if (u <= MAX_SHORT_CHANGE) {
  607. int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
  608. oldLength_ += (u >> 12) * num;
  609. newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num;
  610. } else if (u <= 0x7fff) {
  611. // Read the lengths, and reset the index to the head again.
  612. int32_t headIndex = index++;
  613. oldLength_ += readLength((u >> 6) & 0x3f);
  614. newLength_ += readLength(u & 0x3f);
  615. index = headIndex;
  616. }
  617. }
  618. updatePreviousIndexes();
  619. return true;
  620. }
  621. int32_t Edits::Iterator::findIndex(int32_t i, UBool findSource, UErrorCode &errorCode) {
  622. if (U_FAILURE(errorCode) || i < 0) { return -1; }
  623. int32_t spanStart, spanLength;
  624. if (findSource) { // find source index
  625. spanStart = srcIndex;
  626. spanLength = oldLength_;
  627. } else { // find destination index
  628. spanStart = destIndex;
  629. spanLength = newLength_;
  630. }
  631. if (i < spanStart) {
  632. if (i >= (spanStart / 2)) {
  633. // Search backwards.
  634. for (;;) {
  635. UBool hasPrevious = previous(errorCode);
  636. U_ASSERT(hasPrevious); // because i>=0 and the first span starts at 0
  637. (void)hasPrevious; // avoid unused-variable warning
  638. spanStart = findSource ? srcIndex : destIndex;
  639. if (i >= spanStart) {
  640. // The index is in the current span.
  641. return 0;
  642. }
  643. if (remaining > 0) {
  644. // Is the index in one of the remaining compressed edits?
  645. // spanStart is the start of the current span, first of the remaining ones.
  646. spanLength = findSource ? oldLength_ : newLength_;
  647. int32_t u = array[index];
  648. U_ASSERT(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE);
  649. int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1 - remaining;
  650. int32_t len = num * spanLength;
  651. if (i >= (spanStart - len)) {
  652. int32_t n = ((spanStart - i - 1) / spanLength) + 1;
  653. // 1 <= n <= num
  654. srcIndex -= n * oldLength_;
  655. replIndex -= n * newLength_;
  656. destIndex -= n * newLength_;
  657. remaining += n;
  658. return 0;
  659. }
  660. // Skip all of these edits at once.
  661. srcIndex -= num * oldLength_;
  662. replIndex -= num * newLength_;
  663. destIndex -= num * newLength_;
  664. remaining = 0;
  665. }
  666. }
  667. }
  668. // Reset the iterator to the start.
  669. dir = 0;
  670. index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0;
  671. } else if (i < (spanStart + spanLength)) {
  672. // The index is in the current span.
  673. return 0;
  674. }
  675. while (next(false, errorCode)) {
  676. if (findSource) {
  677. spanStart = srcIndex;
  678. spanLength = oldLength_;
  679. } else {
  680. spanStart = destIndex;
  681. spanLength = newLength_;
  682. }
  683. if (i < (spanStart + spanLength)) {
  684. // The index is in the current span.
  685. return 0;
  686. }
  687. if (remaining > 1) {
  688. // Is the index in one of the remaining compressed edits?
  689. // spanStart is the start of the current span, first of the remaining ones.
  690. int32_t len = remaining * spanLength;
  691. if (i < (spanStart + len)) {
  692. int32_t n = (i - spanStart) / spanLength; // 1 <= n <= remaining - 1
  693. srcIndex += n * oldLength_;
  694. replIndex += n * newLength_;
  695. destIndex += n * newLength_;
  696. remaining -= n;
  697. return 0;
  698. }
  699. // Make next() skip all of these edits at once.
  700. oldLength_ *= remaining;
  701. newLength_ *= remaining;
  702. remaining = 0;
  703. }
  704. }
  705. return 1;
  706. }
  707. int32_t Edits::Iterator::destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode) {
  708. int32_t where = findIndex(i, true, errorCode);
  709. if (where < 0) {
  710. // Error or before the string.
  711. return 0;
  712. }
  713. if (where > 0 || i == srcIndex) {
  714. // At or after string length, or at start of the found span.
  715. return destIndex;
  716. }
  717. if (changed) {
  718. // In a change span, map to its end.
  719. return destIndex + newLength_;
  720. } else {
  721. // In an unchanged span, offset 1:1 within it.
  722. return destIndex + (i - srcIndex);
  723. }
  724. }
  725. int32_t Edits::Iterator::sourceIndexFromDestinationIndex(int32_t i, UErrorCode &errorCode) {
  726. int32_t where = findIndex(i, false, errorCode);
  727. if (where < 0) {
  728. // Error or before the string.
  729. return 0;
  730. }
  731. if (where > 0 || i == destIndex) {
  732. // At or after string length, or at start of the found span.
  733. return srcIndex;
  734. }
  735. if (changed) {
  736. // In a change span, map to its end.
  737. return srcIndex + oldLength_;
  738. } else {
  739. // In an unchanged span, offset within it.
  740. return srcIndex + (i - destIndex);
  741. }
  742. }
  743. UnicodeString& Edits::Iterator::toString(UnicodeString& sb) const {
  744. sb.append(u"{ src[", -1);
  745. ICU_Utility::appendNumber(sb, srcIndex);
  746. sb.append(u"..", -1);
  747. ICU_Utility::appendNumber(sb, srcIndex + oldLength_);
  748. if (changed) {
  749. sb.append(u"] ⇝ dest[", -1);
  750. } else {
  751. sb.append(u"] ≡ dest[", -1);
  752. }
  753. ICU_Utility::appendNumber(sb, destIndex);
  754. sb.append(u"..", -1);
  755. ICU_Utility::appendNumber(sb, destIndex + newLength_);
  756. if (changed) {
  757. sb.append(u"], repl[", -1);
  758. ICU_Utility::appendNumber(sb, replIndex);
  759. sb.append(u"..", -1);
  760. ICU_Utility::appendNumber(sb, replIndex + newLength_);
  761. sb.append(u"] }", -1);
  762. } else {
  763. sb.append(u"] (no-change) }", -1);
  764. }
  765. return sb;
  766. }
  767. U_NAMESPACE_END