123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415 |
- // © 2019 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- // locdistance.cpp
- // created: 2019may08 Markus W. Scherer
- #include "unicode/utypes.h"
- #include "unicode/bytestrie.h"
- #include "unicode/localematcher.h"
- #include "unicode/locid.h"
- #include "unicode/uobject.h"
- #include "unicode/ures.h"
- #include "cstring.h"
- #include "locdistance.h"
- #include "loclikelysubtags.h"
- #include "uassert.h"
- #include "ucln_cmn.h"
- #include "uinvchar.h"
- #include "umutex.h"
- U_NAMESPACE_BEGIN
- namespace {
- /**
- * Bit flag used on the last character of a subtag in the trie.
- * Must be set consistently by the builder and the lookup code.
- */
- constexpr int32_t END_OF_SUBTAG = 0x80;
- /** Distance value bit flag, set by the builder. */
- constexpr int32_t DISTANCE_SKIP_SCRIPT = 0x80;
- /** Distance value bit flag, set by trieNext(). */
- constexpr int32_t DISTANCE_IS_FINAL = 0x100;
- constexpr int32_t DISTANCE_IS_FINAL_OR_SKIP_SCRIPT = DISTANCE_IS_FINAL | DISTANCE_SKIP_SCRIPT;
- constexpr int32_t ABOVE_THRESHOLD = 100;
- // Indexes into array of distances.
- enum {
- IX_DEF_LANG_DISTANCE,
- IX_DEF_SCRIPT_DISTANCE,
- IX_DEF_REGION_DISTANCE,
- IX_MIN_REGION_DISTANCE,
- IX_LIMIT
- };
- LocaleDistance *gLocaleDistance = nullptr;
- UInitOnce gInitOnce {};
- UBool U_CALLCONV cleanup() {
- delete gLocaleDistance;
- gLocaleDistance = nullptr;
- gInitOnce.reset();
- return true;
- }
- } // namespace
- void U_CALLCONV LocaleDistance::initLocaleDistance(UErrorCode &errorCode) {
- // This function is invoked only via umtx_initOnce().
- U_ASSERT(gLocaleDistance == nullptr);
- const LikelySubtags &likely = *LikelySubtags::getSingleton(errorCode);
- if (U_FAILURE(errorCode)) { return; }
- const LocaleDistanceData &data = likely.getDistanceData();
- if (data.distanceTrieBytes == nullptr ||
- data.regionToPartitions == nullptr || data.partitions == nullptr ||
- // ok if no paradigms
- data.distances == nullptr) {
- errorCode = U_MISSING_RESOURCE_ERROR;
- return;
- }
- gLocaleDistance = new LocaleDistance(data, likely);
- if (gLocaleDistance == nullptr) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- ucln_common_registerCleanup(UCLN_COMMON_LOCALE_DISTANCE, cleanup);
- }
- const LocaleDistance *LocaleDistance::getSingleton(UErrorCode &errorCode) {
- if (U_FAILURE(errorCode)) { return nullptr; }
- umtx_initOnce(gInitOnce, &LocaleDistance::initLocaleDistance, errorCode);
- return gLocaleDistance;
- }
- LocaleDistance::LocaleDistance(const LocaleDistanceData &data, const LikelySubtags &likely) :
- likelySubtags(likely),
- trie(data.distanceTrieBytes),
- regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions),
- paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength),
- defaultLanguageDistance(data.distances[IX_DEF_LANG_DISTANCE]),
- defaultScriptDistance(data.distances[IX_DEF_SCRIPT_DISTANCE]),
- defaultRegionDistance(data.distances[IX_DEF_REGION_DISTANCE]),
- minRegionDistance(data.distances[IX_MIN_REGION_DISTANCE]) {
- // For the default demotion value, use the
- // default region distance between unrelated Englishes.
- // Thus, unless demotion is turned off,
- // a mere region difference for one desired locale
- // is as good as a perfect match for the next following desired locale.
- // As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>.
- LSR en("en", "Latn", "US", LSR::EXPLICIT_LSR);
- LSR enGB("en", "Latn", "GB", LSR::EXPLICIT_LSR);
- const LSR *p_enGB = &enGB;
- int32_t indexAndDistance = getBestIndexAndDistance(en, &p_enGB, 1,
- shiftDistance(50), ULOCMATCH_FAVOR_LANGUAGE, ULOCMATCH_DIRECTION_WITH_ONE_WAY);
- defaultDemotionPerDesiredLocale = getDistanceFloor(indexAndDistance);
- }
- int32_t LocaleDistance::getBestIndexAndDistance(
- const LSR &desired,
- const LSR **supportedLSRs, int32_t supportedLSRsLength,
- int32_t shiftedThreshold,
- ULocMatchFavorSubtag favorSubtag, ULocMatchDirection direction) const {
- BytesTrie iter(trie);
- // Look up the desired language only once for all supported LSRs.
- // Its "distance" is either a match point value of 0, or a non-match negative value.
- // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.
- int32_t desLangDistance = trieNext(iter, desired.language, false);
- uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0;
- // Index of the supported LSR with the lowest distance.
- int32_t bestIndex = -1;
- // Cached lookup info from LikelySubtags.compareLikely().
- int32_t bestLikelyInfo = -1;
- for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) {
- const LSR &supported = *supportedLSRs[slIndex];
- bool star = false;
- int32_t distance = desLangDistance;
- if (distance >= 0) {
- U_ASSERT((distance & DISTANCE_IS_FINAL) == 0);
- if (slIndex != 0) {
- iter.resetToState64(desLangState);
- }
- distance = trieNext(iter, supported.language, true);
- }
- // Note: The data builder verifies that there are no rules with "any" (*) language and
- // real (non *) script or region subtags.
- // This means that if the lookup for either language fails we can use
- // the default distances without further lookups.
- int32_t flags;
- if (distance >= 0) {
- flags = distance & DISTANCE_IS_FINAL_OR_SKIP_SCRIPT;
- distance &= ~DISTANCE_IS_FINAL_OR_SKIP_SCRIPT;
- } else { // <*, *>
- if (uprv_strcmp(desired.language, supported.language) == 0) {
- distance = 0;
- } else {
- distance = defaultLanguageDistance;
- }
- flags = 0;
- star = true;
- }
- U_ASSERT(0 <= distance && distance <= 100);
- // Round up the shifted threshold (if fraction bits are not 0)
- // for comparison with un-shifted distances until we need fraction bits.
- // (If we simply shifted non-zero fraction bits away, then we might ignore a language
- // when it's really still a micro distance below the threshold.)
- int32_t roundedThreshold = (shiftedThreshold + DISTANCE_FRACTION_MASK) >> DISTANCE_SHIFT;
- // We implement "favor subtag" by reducing the language subtag distance
- // (unscientifically reducing it to a quarter of the normal value),
- // so that the script distance is relatively more important.
- // For example, given a default language distance of 80, we reduce it to 20,
- // which is below the default threshold of 50, which is the default script distance.
- if (favorSubtag == ULOCMATCH_FAVOR_SCRIPT) {
- distance >>= 2;
- }
- // Let distance == roundedThreshold pass until the tie-breaker logic
- // at the end of the loop.
- if (distance > roundedThreshold) {
- continue;
- }
- int32_t scriptDistance;
- if (star || flags != 0) {
- if (uprv_strcmp(desired.script, supported.script) == 0) {
- scriptDistance = 0;
- } else {
- scriptDistance = defaultScriptDistance;
- }
- } else {
- scriptDistance = getDesSuppScriptDistance(iter, iter.getState64(),
- desired.script, supported.script);
- flags = scriptDistance & DISTANCE_IS_FINAL;
- scriptDistance &= ~DISTANCE_IS_FINAL;
- }
- distance += scriptDistance;
- if (distance > roundedThreshold) {
- continue;
- }
- if (uprv_strcmp(desired.region, supported.region) == 0) {
- // regionDistance = 0
- } else if (star || (flags & DISTANCE_IS_FINAL) != 0) {
- distance += defaultRegionDistance;
- } else {
- int32_t remainingThreshold = roundedThreshold - distance;
- if (minRegionDistance > remainingThreshold) {
- continue;
- }
- // From here on we know the regions are not equal.
- // Map each region to zero or more partitions. (zero = one non-matching string)
- // (Each array of single-character partition strings is encoded as one string.)
- // If either side has more than one, then we find the maximum distance.
- // This could be optimized by adding some more structure, but probably not worth it.
- distance += getRegionPartitionsDistance(
- iter, iter.getState64(),
- partitionsForRegion(desired),
- partitionsForRegion(supported),
- remainingThreshold);
- }
- int32_t shiftedDistance = shiftDistance(distance);
- if (shiftedDistance == 0) {
- // Distinguish between equivalent but originally unequal locales via an
- // additional micro distance.
- shiftedDistance |= (desired.flags ^ supported.flags);
- if (shiftedDistance < shiftedThreshold) {
- if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
- // Is there also a match when we swap desired/supported?
- isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
- if (shiftedDistance == 0) {
- return slIndex << INDEX_SHIFT;
- }
- bestIndex = slIndex;
- shiftedThreshold = shiftedDistance;
- bestLikelyInfo = -1;
- }
- }
- } else {
- if (shiftedDistance < shiftedThreshold) {
- if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
- // Is there also a match when we swap desired/supported?
- isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
- bestIndex = slIndex;
- shiftedThreshold = shiftedDistance;
- bestLikelyInfo = -1;
- }
- } else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) {
- if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
- // Is there also a match when we swap desired/supported?
- isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
- bestLikelyInfo = likelySubtags.compareLikely(
- supported, *supportedLSRs[bestIndex], bestLikelyInfo);
- if ((bestLikelyInfo & 1) != 0) {
- // This supported locale matches as well as the previous best match,
- // and neither matches perfectly,
- // but this one is "more likely" (has more-default subtags).
- bestIndex = slIndex;
- }
- }
- }
- }
- }
- return bestIndex >= 0 ?
- (bestIndex << INDEX_SHIFT) | shiftedThreshold :
- INDEX_NEG_1 | shiftDistance(ABOVE_THRESHOLD);
- }
- int32_t LocaleDistance::getDesSuppScriptDistance(
- BytesTrie &iter, uint64_t startState, const char *desired, const char *supported) {
- // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.
- int32_t distance = trieNext(iter, desired, false);
- if (distance >= 0) {
- distance = trieNext(iter, supported, true);
- }
- if (distance < 0) {
- UStringTrieResult result = iter.resetToState64(startState).next(u'*'); // <*, *>
- U_ASSERT(USTRINGTRIE_HAS_VALUE(result));
- if (uprv_strcmp(desired, supported) == 0) {
- distance = 0; // same script
- } else {
- distance = iter.getValue();
- U_ASSERT(distance >= 0);
- }
- if (result == USTRINGTRIE_FINAL_VALUE) {
- distance |= DISTANCE_IS_FINAL;
- }
- }
- return distance;
- }
- int32_t LocaleDistance::getRegionPartitionsDistance(
- BytesTrie &iter, uint64_t startState,
- const char *desiredPartitions, const char *supportedPartitions, int32_t threshold) {
- char desired = *desiredPartitions++;
- char supported = *supportedPartitions++;
- U_ASSERT(desired != 0 && supported != 0);
- // See if we have single desired/supported partitions, from NUL-terminated
- // partition strings without explicit length.
- bool suppLengthGt1 = *supportedPartitions != 0; // gt1: more than 1 character
- // equivalent to: if (desLength == 1 && suppLength == 1)
- if (*desiredPartitions == 0 && !suppLengthGt1) {
- // Fastpath for single desired/supported partitions.
- UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG);
- if (USTRINGTRIE_HAS_NEXT(result)) {
- result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG);
- if (USTRINGTRIE_HAS_VALUE(result)) {
- return iter.getValue();
- }
- }
- return getFallbackRegionDistance(iter, startState);
- }
- const char *supportedStart = supportedPartitions - 1; // for restart of inner loop
- int32_t regionDistance = 0;
- // Fall back to * only once, not for each pair of partition strings.
- bool star = false;
- for (;;) {
- // Look up each desired-partition string only once,
- // not for each (desired, supported) pair.
- UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG);
- if (USTRINGTRIE_HAS_NEXT(result)) {
- uint64_t desState = suppLengthGt1 ? iter.getState64() : 0;
- for (;;) {
- result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG);
- int32_t d;
- if (USTRINGTRIE_HAS_VALUE(result)) {
- d = iter.getValue();
- } else if (star) {
- d = 0;
- } else {
- d = getFallbackRegionDistance(iter, startState);
- star = true;
- }
- if (d > threshold) {
- return d;
- } else if (regionDistance < d) {
- regionDistance = d;
- }
- if ((supported = *supportedPartitions++) != 0) {
- iter.resetToState64(desState);
- } else {
- break;
- }
- }
- } else if (!star) {
- int32_t d = getFallbackRegionDistance(iter, startState);
- if (d > threshold) {
- return d;
- } else if (regionDistance < d) {
- regionDistance = d;
- }
- star = true;
- }
- if ((desired = *desiredPartitions++) != 0) {
- iter.resetToState64(startState);
- supportedPartitions = supportedStart;
- supported = *supportedPartitions++;
- } else {
- break;
- }
- }
- return regionDistance;
- }
- int32_t LocaleDistance::getFallbackRegionDistance(BytesTrie &iter, uint64_t startState) {
- #if U_DEBUG
- UStringTrieResult result =
- #endif
- iter.resetToState64(startState).next(u'*'); // <*, *>
- U_ASSERT(USTRINGTRIE_HAS_VALUE(result));
- int32_t distance = iter.getValue();
- U_ASSERT(distance >= 0);
- return distance;
- }
- int32_t LocaleDistance::trieNext(BytesTrie &iter, const char *s, bool wantValue) {
- uint8_t c;
- if ((c = *s) == 0) {
- return -1; // no empty subtags in the distance data
- }
- for (;;) {
- c = uprv_invCharToAscii(c);
- // EBCDIC: If *s is not an invariant character,
- // then c is now 0 and will simply not match anything, which is harmless.
- uint8_t next = *++s;
- if (next != 0) {
- if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) {
- return -1;
- }
- } else {
- // last character of this subtag
- UStringTrieResult result = iter.next(c | END_OF_SUBTAG);
- if (wantValue) {
- if (USTRINGTRIE_HAS_VALUE(result)) {
- int32_t value = iter.getValue();
- if (result == USTRINGTRIE_FINAL_VALUE) {
- value |= DISTANCE_IS_FINAL;
- }
- return value;
- }
- } else {
- if (USTRINGTRIE_HAS_NEXT(result)) {
- return 0;
- }
- }
- return -1;
- }
- c = next;
- }
- }
- bool LocaleDistance::isParadigmLSR(const LSR &lsr) const {
- // Linear search for a very short list (length 6 as of 2019),
- // because we look for equivalence not equality, and
- // because it's easy.
- // If there are many paradigm LSRs we should use a hash set
- // with custom comparator and hasher.
- U_ASSERT(paradigmLSRsLength <= 15);
- for (int32_t i = 0; i < paradigmLSRsLength; ++i) {
- if (lsr.isEquivalentTo(paradigmLSRs[i])) { return true; }
- }
- return false;
- }
- U_NAMESPACE_END
|