locdistance.cpp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415
  1. // © 2019 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. // locdistance.cpp
  4. // created: 2019may08 Markus W. Scherer
  5. #include "unicode/utypes.h"
  6. #include "unicode/bytestrie.h"
  7. #include "unicode/localematcher.h"
  8. #include "unicode/locid.h"
  9. #include "unicode/uobject.h"
  10. #include "unicode/ures.h"
  11. #include "cstring.h"
  12. #include "locdistance.h"
  13. #include "loclikelysubtags.h"
  14. #include "uassert.h"
  15. #include "ucln_cmn.h"
  16. #include "uinvchar.h"
  17. #include "umutex.h"
  18. U_NAMESPACE_BEGIN
  19. namespace {
  20. /**
  21. * Bit flag used on the last character of a subtag in the trie.
  22. * Must be set consistently by the builder and the lookup code.
  23. */
  24. constexpr int32_t END_OF_SUBTAG = 0x80;
  25. /** Distance value bit flag, set by the builder. */
  26. constexpr int32_t DISTANCE_SKIP_SCRIPT = 0x80;
  27. /** Distance value bit flag, set by trieNext(). */
  28. constexpr int32_t DISTANCE_IS_FINAL = 0x100;
  29. constexpr int32_t DISTANCE_IS_FINAL_OR_SKIP_SCRIPT = DISTANCE_IS_FINAL | DISTANCE_SKIP_SCRIPT;
  30. constexpr int32_t ABOVE_THRESHOLD = 100;
  31. // Indexes into array of distances.
  32. enum {
  33. IX_DEF_LANG_DISTANCE,
  34. IX_DEF_SCRIPT_DISTANCE,
  35. IX_DEF_REGION_DISTANCE,
  36. IX_MIN_REGION_DISTANCE,
  37. IX_LIMIT
  38. };
  39. LocaleDistance *gLocaleDistance = nullptr;
  40. UInitOnce gInitOnce {};
  41. UBool U_CALLCONV cleanup() {
  42. delete gLocaleDistance;
  43. gLocaleDistance = nullptr;
  44. gInitOnce.reset();
  45. return true;
  46. }
  47. } // namespace
  48. void U_CALLCONV LocaleDistance::initLocaleDistance(UErrorCode &errorCode) {
  49. // This function is invoked only via umtx_initOnce().
  50. U_ASSERT(gLocaleDistance == nullptr);
  51. const LikelySubtags &likely = *LikelySubtags::getSingleton(errorCode);
  52. if (U_FAILURE(errorCode)) { return; }
  53. const LocaleDistanceData &data = likely.getDistanceData();
  54. if (data.distanceTrieBytes == nullptr ||
  55. data.regionToPartitions == nullptr || data.partitions == nullptr ||
  56. // ok if no paradigms
  57. data.distances == nullptr) {
  58. errorCode = U_MISSING_RESOURCE_ERROR;
  59. return;
  60. }
  61. gLocaleDistance = new LocaleDistance(data, likely);
  62. if (gLocaleDistance == nullptr) {
  63. errorCode = U_MEMORY_ALLOCATION_ERROR;
  64. return;
  65. }
  66. ucln_common_registerCleanup(UCLN_COMMON_LOCALE_DISTANCE, cleanup);
  67. }
  68. const LocaleDistance *LocaleDistance::getSingleton(UErrorCode &errorCode) {
  69. if (U_FAILURE(errorCode)) { return nullptr; }
  70. umtx_initOnce(gInitOnce, &LocaleDistance::initLocaleDistance, errorCode);
  71. return gLocaleDistance;
  72. }
  73. LocaleDistance::LocaleDistance(const LocaleDistanceData &data, const LikelySubtags &likely) :
  74. likelySubtags(likely),
  75. trie(data.distanceTrieBytes),
  76. regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions),
  77. paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength),
  78. defaultLanguageDistance(data.distances[IX_DEF_LANG_DISTANCE]),
  79. defaultScriptDistance(data.distances[IX_DEF_SCRIPT_DISTANCE]),
  80. defaultRegionDistance(data.distances[IX_DEF_REGION_DISTANCE]),
  81. minRegionDistance(data.distances[IX_MIN_REGION_DISTANCE]) {
  82. // For the default demotion value, use the
  83. // default region distance between unrelated Englishes.
  84. // Thus, unless demotion is turned off,
  85. // a mere region difference for one desired locale
  86. // is as good as a perfect match for the next following desired locale.
  87. // As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>.
  88. LSR en("en", "Latn", "US", LSR::EXPLICIT_LSR);
  89. LSR enGB("en", "Latn", "GB", LSR::EXPLICIT_LSR);
  90. const LSR *p_enGB = &enGB;
  91. int32_t indexAndDistance = getBestIndexAndDistance(en, &p_enGB, 1,
  92. shiftDistance(50), ULOCMATCH_FAVOR_LANGUAGE, ULOCMATCH_DIRECTION_WITH_ONE_WAY);
  93. defaultDemotionPerDesiredLocale = getDistanceFloor(indexAndDistance);
  94. }
  95. int32_t LocaleDistance::getBestIndexAndDistance(
  96. const LSR &desired,
  97. const LSR **supportedLSRs, int32_t supportedLSRsLength,
  98. int32_t shiftedThreshold,
  99. ULocMatchFavorSubtag favorSubtag, ULocMatchDirection direction) const {
  100. BytesTrie iter(trie);
  101. // Look up the desired language only once for all supported LSRs.
  102. // Its "distance" is either a match point value of 0, or a non-match negative value.
  103. // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.
  104. int32_t desLangDistance = trieNext(iter, desired.language, false);
  105. uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0;
  106. // Index of the supported LSR with the lowest distance.
  107. int32_t bestIndex = -1;
  108. // Cached lookup info from LikelySubtags.compareLikely().
  109. int32_t bestLikelyInfo = -1;
  110. for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) {
  111. const LSR &supported = *supportedLSRs[slIndex];
  112. bool star = false;
  113. int32_t distance = desLangDistance;
  114. if (distance >= 0) {
  115. U_ASSERT((distance & DISTANCE_IS_FINAL) == 0);
  116. if (slIndex != 0) {
  117. iter.resetToState64(desLangState);
  118. }
  119. distance = trieNext(iter, supported.language, true);
  120. }
  121. // Note: The data builder verifies that there are no rules with "any" (*) language and
  122. // real (non *) script or region subtags.
  123. // This means that if the lookup for either language fails we can use
  124. // the default distances without further lookups.
  125. int32_t flags;
  126. if (distance >= 0) {
  127. flags = distance & DISTANCE_IS_FINAL_OR_SKIP_SCRIPT;
  128. distance &= ~DISTANCE_IS_FINAL_OR_SKIP_SCRIPT;
  129. } else { // <*, *>
  130. if (uprv_strcmp(desired.language, supported.language) == 0) {
  131. distance = 0;
  132. } else {
  133. distance = defaultLanguageDistance;
  134. }
  135. flags = 0;
  136. star = true;
  137. }
  138. U_ASSERT(0 <= distance && distance <= 100);
  139. // Round up the shifted threshold (if fraction bits are not 0)
  140. // for comparison with un-shifted distances until we need fraction bits.
  141. // (If we simply shifted non-zero fraction bits away, then we might ignore a language
  142. // when it's really still a micro distance below the threshold.)
  143. int32_t roundedThreshold = (shiftedThreshold + DISTANCE_FRACTION_MASK) >> DISTANCE_SHIFT;
  144. // We implement "favor subtag" by reducing the language subtag distance
  145. // (unscientifically reducing it to a quarter of the normal value),
  146. // so that the script distance is relatively more important.
  147. // For example, given a default language distance of 80, we reduce it to 20,
  148. // which is below the default threshold of 50, which is the default script distance.
  149. if (favorSubtag == ULOCMATCH_FAVOR_SCRIPT) {
  150. distance >>= 2;
  151. }
  152. // Let distance == roundedThreshold pass until the tie-breaker logic
  153. // at the end of the loop.
  154. if (distance > roundedThreshold) {
  155. continue;
  156. }
  157. int32_t scriptDistance;
  158. if (star || flags != 0) {
  159. if (uprv_strcmp(desired.script, supported.script) == 0) {
  160. scriptDistance = 0;
  161. } else {
  162. scriptDistance = defaultScriptDistance;
  163. }
  164. } else {
  165. scriptDistance = getDesSuppScriptDistance(iter, iter.getState64(),
  166. desired.script, supported.script);
  167. flags = scriptDistance & DISTANCE_IS_FINAL;
  168. scriptDistance &= ~DISTANCE_IS_FINAL;
  169. }
  170. distance += scriptDistance;
  171. if (distance > roundedThreshold) {
  172. continue;
  173. }
  174. if (uprv_strcmp(desired.region, supported.region) == 0) {
  175. // regionDistance = 0
  176. } else if (star || (flags & DISTANCE_IS_FINAL) != 0) {
  177. distance += defaultRegionDistance;
  178. } else {
  179. int32_t remainingThreshold = roundedThreshold - distance;
  180. if (minRegionDistance > remainingThreshold) {
  181. continue;
  182. }
  183. // From here on we know the regions are not equal.
  184. // Map each region to zero or more partitions. (zero = one non-matching string)
  185. // (Each array of single-character partition strings is encoded as one string.)
  186. // If either side has more than one, then we find the maximum distance.
  187. // This could be optimized by adding some more structure, but probably not worth it.
  188. distance += getRegionPartitionsDistance(
  189. iter, iter.getState64(),
  190. partitionsForRegion(desired),
  191. partitionsForRegion(supported),
  192. remainingThreshold);
  193. }
  194. int32_t shiftedDistance = shiftDistance(distance);
  195. if (shiftedDistance == 0) {
  196. // Distinguish between equivalent but originally unequal locales via an
  197. // additional micro distance.
  198. shiftedDistance |= (desired.flags ^ supported.flags);
  199. if (shiftedDistance < shiftedThreshold) {
  200. if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
  201. // Is there also a match when we swap desired/supported?
  202. isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
  203. if (shiftedDistance == 0) {
  204. return slIndex << INDEX_SHIFT;
  205. }
  206. bestIndex = slIndex;
  207. shiftedThreshold = shiftedDistance;
  208. bestLikelyInfo = -1;
  209. }
  210. }
  211. } else {
  212. if (shiftedDistance < shiftedThreshold) {
  213. if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
  214. // Is there also a match when we swap desired/supported?
  215. isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
  216. bestIndex = slIndex;
  217. shiftedThreshold = shiftedDistance;
  218. bestLikelyInfo = -1;
  219. }
  220. } else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) {
  221. if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
  222. // Is there also a match when we swap desired/supported?
  223. isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
  224. bestLikelyInfo = likelySubtags.compareLikely(
  225. supported, *supportedLSRs[bestIndex], bestLikelyInfo);
  226. if ((bestLikelyInfo & 1) != 0) {
  227. // This supported locale matches as well as the previous best match,
  228. // and neither matches perfectly,
  229. // but this one is "more likely" (has more-default subtags).
  230. bestIndex = slIndex;
  231. }
  232. }
  233. }
  234. }
  235. }
  236. return bestIndex >= 0 ?
  237. (bestIndex << INDEX_SHIFT) | shiftedThreshold :
  238. INDEX_NEG_1 | shiftDistance(ABOVE_THRESHOLD);
  239. }
  240. int32_t LocaleDistance::getDesSuppScriptDistance(
  241. BytesTrie &iter, uint64_t startState, const char *desired, const char *supported) {
  242. // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.
  243. int32_t distance = trieNext(iter, desired, false);
  244. if (distance >= 0) {
  245. distance = trieNext(iter, supported, true);
  246. }
  247. if (distance < 0) {
  248. UStringTrieResult result = iter.resetToState64(startState).next(u'*'); // <*, *>
  249. U_ASSERT(USTRINGTRIE_HAS_VALUE(result));
  250. if (uprv_strcmp(desired, supported) == 0) {
  251. distance = 0; // same script
  252. } else {
  253. distance = iter.getValue();
  254. U_ASSERT(distance >= 0);
  255. }
  256. if (result == USTRINGTRIE_FINAL_VALUE) {
  257. distance |= DISTANCE_IS_FINAL;
  258. }
  259. }
  260. return distance;
  261. }
  262. int32_t LocaleDistance::getRegionPartitionsDistance(
  263. BytesTrie &iter, uint64_t startState,
  264. const char *desiredPartitions, const char *supportedPartitions, int32_t threshold) {
  265. char desired = *desiredPartitions++;
  266. char supported = *supportedPartitions++;
  267. U_ASSERT(desired != 0 && supported != 0);
  268. // See if we have single desired/supported partitions, from NUL-terminated
  269. // partition strings without explicit length.
  270. bool suppLengthGt1 = *supportedPartitions != 0; // gt1: more than 1 character
  271. // equivalent to: if (desLength == 1 && suppLength == 1)
  272. if (*desiredPartitions == 0 && !suppLengthGt1) {
  273. // Fastpath for single desired/supported partitions.
  274. UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG);
  275. if (USTRINGTRIE_HAS_NEXT(result)) {
  276. result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG);
  277. if (USTRINGTRIE_HAS_VALUE(result)) {
  278. return iter.getValue();
  279. }
  280. }
  281. return getFallbackRegionDistance(iter, startState);
  282. }
  283. const char *supportedStart = supportedPartitions - 1; // for restart of inner loop
  284. int32_t regionDistance = 0;
  285. // Fall back to * only once, not for each pair of partition strings.
  286. bool star = false;
  287. for (;;) {
  288. // Look up each desired-partition string only once,
  289. // not for each (desired, supported) pair.
  290. UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG);
  291. if (USTRINGTRIE_HAS_NEXT(result)) {
  292. uint64_t desState = suppLengthGt1 ? iter.getState64() : 0;
  293. for (;;) {
  294. result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG);
  295. int32_t d;
  296. if (USTRINGTRIE_HAS_VALUE(result)) {
  297. d = iter.getValue();
  298. } else if (star) {
  299. d = 0;
  300. } else {
  301. d = getFallbackRegionDistance(iter, startState);
  302. star = true;
  303. }
  304. if (d > threshold) {
  305. return d;
  306. } else if (regionDistance < d) {
  307. regionDistance = d;
  308. }
  309. if ((supported = *supportedPartitions++) != 0) {
  310. iter.resetToState64(desState);
  311. } else {
  312. break;
  313. }
  314. }
  315. } else if (!star) {
  316. int32_t d = getFallbackRegionDistance(iter, startState);
  317. if (d > threshold) {
  318. return d;
  319. } else if (regionDistance < d) {
  320. regionDistance = d;
  321. }
  322. star = true;
  323. }
  324. if ((desired = *desiredPartitions++) != 0) {
  325. iter.resetToState64(startState);
  326. supportedPartitions = supportedStart;
  327. supported = *supportedPartitions++;
  328. } else {
  329. break;
  330. }
  331. }
  332. return regionDistance;
  333. }
  334. int32_t LocaleDistance::getFallbackRegionDistance(BytesTrie &iter, uint64_t startState) {
  335. #if U_DEBUG
  336. UStringTrieResult result =
  337. #endif
  338. iter.resetToState64(startState).next(u'*'); // <*, *>
  339. U_ASSERT(USTRINGTRIE_HAS_VALUE(result));
  340. int32_t distance = iter.getValue();
  341. U_ASSERT(distance >= 0);
  342. return distance;
  343. }
  344. int32_t LocaleDistance::trieNext(BytesTrie &iter, const char *s, bool wantValue) {
  345. uint8_t c;
  346. if ((c = *s) == 0) {
  347. return -1; // no empty subtags in the distance data
  348. }
  349. for (;;) {
  350. c = uprv_invCharToAscii(c);
  351. // EBCDIC: If *s is not an invariant character,
  352. // then c is now 0 and will simply not match anything, which is harmless.
  353. uint8_t next = *++s;
  354. if (next != 0) {
  355. if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) {
  356. return -1;
  357. }
  358. } else {
  359. // last character of this subtag
  360. UStringTrieResult result = iter.next(c | END_OF_SUBTAG);
  361. if (wantValue) {
  362. if (USTRINGTRIE_HAS_VALUE(result)) {
  363. int32_t value = iter.getValue();
  364. if (result == USTRINGTRIE_FINAL_VALUE) {
  365. value |= DISTANCE_IS_FINAL;
  366. }
  367. return value;
  368. }
  369. } else {
  370. if (USTRINGTRIE_HAS_NEXT(result)) {
  371. return 0;
  372. }
  373. }
  374. return -1;
  375. }
  376. c = next;
  377. }
  378. }
  379. bool LocaleDistance::isParadigmLSR(const LSR &lsr) const {
  380. // Linear search for a very short list (length 6 as of 2019),
  381. // because we look for equivalence not equality, and
  382. // because it's easy.
  383. // If there are many paradigm LSRs we should use a hash set
  384. // with custom comparator and hasher.
  385. U_ASSERT(paradigmLSRsLength <= 15);
  386. for (int32_t i = 0; i < paradigmLSRsLength; ++i) {
  387. if (lsr.isEquivalentTo(paradigmLSRs[i])) { return true; }
  388. }
  389. return false;
  390. }
  391. U_NAMESPACE_END