123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- /*
- *******************************************************************************
- * Copyright (C) 2010-2012, International Business Machines
- * Corporation and others. All Rights Reserved.
- *******************************************************************************
- * file name: ucharstriebuilder.h
- * encoding: UTF-8
- * tab size: 8 (not used)
- * indentation:4
- *
- * created on: 2010nov14
- * created by: Markus W. Scherer
- */
- #include "unicode/utypes.h"
- #include "unicode/ucharstrie.h"
- #include "unicode/ucharstriebuilder.h"
- #include "unicode/unistr.h"
- #include "unicode/ustring.h"
- #include "cmemory.h"
- #include "uarrsort.h"
- #include "uassert.h"
- #include "uhash.h"
- #include "ustr_imp.h"
- U_NAMESPACE_BEGIN
- /*
- * Note: This builder implementation stores (string, value) pairs with full copies
- * of the 16-bit-unit sequences, until the UCharsTrie is built.
- * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
- */
- class UCharsTrieElement : public UMemory {
- public:
- // Use compiler's default constructor, initializes nothing.
- void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode);
- UnicodeString getString(const UnicodeString &strings) const {
- int32_t length=strings[stringOffset];
- return strings.tempSubString(stringOffset+1, length);
- }
- int32_t getStringLength(const UnicodeString &strings) const {
- return strings[stringOffset];
- }
- char16_t charAt(int32_t index, const UnicodeString &strings) const {
- return strings[stringOffset+1+index];
- }
- int32_t getValue() const { return value; }
- int32_t compareStringTo(const UCharsTrieElement &o, const UnicodeString &strings) const;
- private:
- // The first strings unit contains the string length.
- // (Compared with a stringLength field here, this saves 2 bytes per string.)
- int32_t stringOffset;
- int32_t value;
- };
- void
- UCharsTrieElement::setTo(const UnicodeString &s, int32_t val,
- UnicodeString &strings, UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) {
- return;
- }
- int32_t length=s.length();
- if(length>0xffff) {
- // Too long: We store the length in 1 unit.
- errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
- return;
- }
- stringOffset=strings.length();
- strings.append(static_cast<char16_t>(length));
- value=val;
- strings.append(s);
- }
- int32_t
- UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const UnicodeString &strings) const {
- return getString(strings).compare(other.getString(strings));
- }
- UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/)
- : elements(nullptr), elementsCapacity(0), elementsLength(0),
- uchars(nullptr), ucharsCapacity(0), ucharsLength(0) {}
- UCharsTrieBuilder::~UCharsTrieBuilder() {
- delete[] elements;
- uprv_free(uchars);
- }
- UCharsTrieBuilder &
- UCharsTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) {
- return *this;
- }
- if(ucharsLength>0) {
- // Cannot add elements after building.
- errorCode=U_NO_WRITE_PERMISSION;
- return *this;
- }
- if(elementsLength==elementsCapacity) {
- int32_t newCapacity;
- if(elementsCapacity==0) {
- newCapacity=1024;
- } else {
- newCapacity=4*elementsCapacity;
- }
- UCharsTrieElement *newElements=new UCharsTrieElement[newCapacity];
- if(newElements==nullptr) {
- errorCode=U_MEMORY_ALLOCATION_ERROR;
- return *this;
- }
- if(elementsLength>0) {
- uprv_memcpy(newElements, elements, (size_t)elementsLength*sizeof(UCharsTrieElement));
- }
- delete[] elements;
- elements=newElements;
- elementsCapacity=newCapacity;
- }
- elements[elementsLength++].setTo(s, value, strings, errorCode);
- if(U_SUCCESS(errorCode) && strings.isBogus()) {
- errorCode=U_MEMORY_ALLOCATION_ERROR;
- }
- return *this;
- }
- U_CDECL_BEGIN
- static int32_t U_CALLCONV
- compareElementStrings(const void *context, const void *left, const void *right) {
- const UnicodeString *strings=static_cast<const UnicodeString *>(context);
- const UCharsTrieElement *leftElement=static_cast<const UCharsTrieElement *>(left);
- const UCharsTrieElement *rightElement=static_cast<const UCharsTrieElement *>(right);
- return leftElement->compareStringTo(*rightElement, *strings);
- }
- U_CDECL_END
- UCharsTrie *
- UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
- buildUChars(buildOption, errorCode);
- UCharsTrie *newTrie=nullptr;
- if(U_SUCCESS(errorCode)) {
- newTrie=new UCharsTrie(uchars, uchars+(ucharsCapacity-ucharsLength));
- if(newTrie==nullptr) {
- errorCode=U_MEMORY_ALLOCATION_ERROR;
- } else {
- uchars=nullptr; // The new trie now owns the array.
- ucharsCapacity=0;
- }
- }
- return newTrie;
- }
- UnicodeString &
- UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result,
- UErrorCode &errorCode) {
- buildUChars(buildOption, errorCode);
- if(U_SUCCESS(errorCode)) {
- result.setTo(false, uchars+(ucharsCapacity-ucharsLength), ucharsLength);
- }
- return result;
- }
- void
- UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) {
- return;
- }
- if(uchars!=nullptr && ucharsLength>0) {
- // Already built.
- return;
- }
- if(ucharsLength==0) {
- if(elementsLength==0) {
- errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
- return;
- }
- if(strings.isBogus()) {
- errorCode=U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- uprv_sortArray(elements, elementsLength, static_cast<int32_t>(sizeof(UCharsTrieElement)),
- compareElementStrings, &strings,
- false, // need not be a stable sort
- &errorCode);
- if(U_FAILURE(errorCode)) {
- return;
- }
- // Duplicate strings are not allowed.
- UnicodeString prev=elements[0].getString(strings);
- for(int32_t i=1; i<elementsLength; ++i) {
- UnicodeString current=elements[i].getString(strings);
- if(prev==current) {
- errorCode=U_ILLEGAL_ARGUMENT_ERROR;
- return;
- }
- prev.fastCopyFrom(current);
- }
- }
- // Create and char16_t-serialize the trie for the elements.
- ucharsLength=0;
- int32_t capacity=strings.length();
- if(capacity<1024) {
- capacity=1024;
- }
- if(ucharsCapacity<capacity) {
- uprv_free(uchars);
- uchars=static_cast<char16_t *>(uprv_malloc(capacity*2));
- if(uchars==nullptr) {
- errorCode=U_MEMORY_ALLOCATION_ERROR;
- ucharsCapacity=0;
- return;
- }
- ucharsCapacity=capacity;
- }
- StringTrieBuilder::build(buildOption, elementsLength, errorCode);
- if(uchars==nullptr) {
- errorCode=U_MEMORY_ALLOCATION_ERROR;
- }
- }
- int32_t
- UCharsTrieBuilder::getElementStringLength(int32_t i) const {
- return elements[i].getStringLength(strings);
- }
- char16_t
- UCharsTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const {
- return elements[i].charAt(unitIndex, strings);
- }
- int32_t
- UCharsTrieBuilder::getElementValue(int32_t i) const {
- return elements[i].getValue();
- }
- int32_t
- UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const {
- const UCharsTrieElement &firstElement=elements[first];
- const UCharsTrieElement &lastElement=elements[last];
- int32_t minStringLength=firstElement.getStringLength(strings);
- while(++unitIndex<minStringLength &&
- firstElement.charAt(unitIndex, strings)==
- lastElement.charAt(unitIndex, strings)) {}
- return unitIndex;
- }
- int32_t
- UCharsTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const {
- int32_t length=0; // Number of different units at unitIndex.
- int32_t i=start;
- do {
- char16_t unit=elements[i++].charAt(unitIndex, strings);
- while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
- ++i;
- }
- ++length;
- } while(i<limit);
- return length;
- }
- int32_t
- UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const {
- do {
- char16_t unit=elements[i++].charAt(unitIndex, strings);
- while(unit==elements[i].charAt(unitIndex, strings)) {
- ++i;
- }
- } while(--count>0);
- return i;
- }
- int32_t
- UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, char16_t unit) const {
- while(unit==elements[i].charAt(unitIndex, strings)) {
- ++i;
- }
- return i;
- }
- UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const char16_t *units, int32_t len, Node *nextNode)
- : LinearMatchNode(len, nextNode), s(units) {
- hash=hash*37u+ustr_hashUCharsN(units, len);
- }
- bool
- UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const {
- if(this==&other) {
- return true;
- }
- if(!LinearMatchNode::operator==(other)) {
- return false;
- }
- const UCTLinearMatchNode &o=static_cast<const UCTLinearMatchNode &>(other);
- return 0==u_memcmp(s, o.s, length);
- }
- void
- UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder &builder) {
- UCharsTrieBuilder &b=(UCharsTrieBuilder &)builder;
- next->write(builder);
- b.write(s, length);
- offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1);
- }
- StringTrieBuilder::Node *
- UCharsTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
- Node *nextNode) const {
- return new UCTLinearMatchNode(
- elements[i].getString(strings).getBuffer()+unitIndex,
- length,
- nextNode);
- }
- UBool
- UCharsTrieBuilder::ensureCapacity(int32_t length) {
- if(uchars==nullptr) {
- return false; // previous memory allocation had failed
- }
- if(length>ucharsCapacity) {
- int32_t newCapacity=ucharsCapacity;
- do {
- newCapacity*=2;
- } while(newCapacity<=length);
- char16_t *newUChars=static_cast<char16_t *>(uprv_malloc(newCapacity*2));
- if(newUChars==nullptr) {
- // unable to allocate memory
- uprv_free(uchars);
- uchars=nullptr;
- ucharsCapacity=0;
- return false;
- }
- u_memcpy(newUChars+(newCapacity-ucharsLength),
- uchars+(ucharsCapacity-ucharsLength), ucharsLength);
- uprv_free(uchars);
- uchars=newUChars;
- ucharsCapacity=newCapacity;
- }
- return true;
- }
- int32_t
- UCharsTrieBuilder::write(int32_t unit) {
- int32_t newLength=ucharsLength+1;
- if(ensureCapacity(newLength)) {
- ucharsLength=newLength;
- uchars[ucharsCapacity - ucharsLength] = static_cast<char16_t>(unit);
- }
- return ucharsLength;
- }
- int32_t
- UCharsTrieBuilder::write(const char16_t *s, int32_t length) {
- int32_t newLength=ucharsLength+length;
- if(ensureCapacity(newLength)) {
- ucharsLength=newLength;
- u_memcpy(uchars+(ucharsCapacity-ucharsLength), s, length);
- }
- return ucharsLength;
- }
- int32_t
- UCharsTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) {
- return write(elements[i].getString(strings).getBuffer()+unitIndex, length);
- }
- int32_t
- UCharsTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
- if(0<=i && i<=UCharsTrie::kMaxOneUnitValue) {
- return write(i|(isFinal<<15));
- }
- char16_t intUnits[3];
- int32_t length;
- if(i<0 || i>UCharsTrie::kMaxTwoUnitValue) {
- intUnits[0] = static_cast<char16_t>(UCharsTrie::kThreeUnitValueLead);
- intUnits[1] = static_cast<char16_t>(static_cast<uint32_t>(i) >> 16);
- intUnits[2] = static_cast<char16_t>(i);
- length=3;
- // } else if(i<=UCharsTrie::kMaxOneUnitValue) {
- // intUnits[0]=(char16_t)(i);
- // length=1;
- } else {
- intUnits[0] = static_cast<char16_t>(UCharsTrie::kMinTwoUnitValueLead + (i >> 16));
- intUnits[1] = static_cast<char16_t>(i);
- length=2;
- }
- intUnits[0] = static_cast<char16_t>(intUnits[0] | (isFinal << 15));
- return write(intUnits, length);
- }
- int32_t
- UCharsTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
- if(!hasValue) {
- return write(node);
- }
- char16_t intUnits[3];
- int32_t length;
- if(value<0 || value>UCharsTrie::kMaxTwoUnitNodeValue) {
- intUnits[0] = static_cast<char16_t>(UCharsTrie::kThreeUnitNodeValueLead);
- intUnits[1] = static_cast<char16_t>(static_cast<uint32_t>(value) >> 16);
- intUnits[2] = static_cast<char16_t>(value);
- length=3;
- } else if(value<=UCharsTrie::kMaxOneUnitNodeValue) {
- intUnits[0] = static_cast<char16_t>((value + 1) << 6);
- length=1;
- } else {
- intUnits[0] = static_cast<char16_t>(UCharsTrie::kMinTwoUnitNodeValueLead + ((value >> 10) & 0x7fc0));
- intUnits[1] = static_cast<char16_t>(value);
- length=2;
- }
- intUnits[0] |= static_cast<char16_t>(node);
- return write(intUnits, length);
- }
- int32_t
- UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
- int32_t i=ucharsLength-jumpTarget;
- U_ASSERT(i>=0);
- if(i<=UCharsTrie::kMaxOneUnitDelta) {
- return write(i);
- }
- char16_t intUnits[3];
- int32_t length;
- if(i<=UCharsTrie::kMaxTwoUnitDelta) {
- intUnits[0] = static_cast<char16_t>(UCharsTrie::kMinTwoUnitDeltaLead + (i >> 16));
- length=1;
- } else {
- intUnits[0] = static_cast<char16_t>(UCharsTrie::kThreeUnitDeltaLead);
- intUnits[1] = static_cast<char16_t>(i >> 16);
- length=2;
- }
- intUnits[length++] = static_cast<char16_t>(i);
- return write(intUnits, length);
- }
- U_NAMESPACE_END
|