123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384 |
- #pragma once
- #include "decomposition_table.h"
- #include <util/charset/unidata.h>
- #include <util/charset/wide.h>
- #include <util/generic/hash.h>
- #include <util/generic/vector.h>
- #include <util/generic/algorithm.h>
- #include <util/generic/singleton.h>
- #include <util/generic/noncopyable.h>
- #include <utility>
- namespace NUnicode {
- enum ENormalization {
- NFD,
- NFC,
- NFKD,
- NFKC,
- };
- // Грубо говоря:
- // NFD расскладывает "ё" на "е + диакритику"
- // NFC сначала всё раскладывает, потом всё что может - складывает
- // NFKD делает то же, что и NFD. Кроме того, например, римскую IV (\x2163)
- // превращает в латинские I и V
- // NFKC - NFKD + композиция (римская четвёрка из I и V, естественно, не образуется)
- // Формальная спецификация: http://www.unicode.org/reports/tr15/
- namespace NPrivate {
- inline const wchar32* Decomposition(const TDecompositionTable& table, wchar32 ch) {
- return table.Get(ch, static_cast<const wchar32*>(nullptr));
- }
- class TDecompositor {
- private:
- const TDecompositionTable& Table;
- public:
- inline TDecompositor(const TDecompositionTable& table)
- : Table(table)
- {
- }
- inline const wchar32* Decomposition(wchar32 ch) const {
- return NPrivate::Decomposition(Table, ch);
- }
- };
- template <bool IsCompat>
- struct TStandartDecompositor: public TDecompositor {
- TStandartDecompositor()
- : TDecompositor(NPrivate::DecompositionTable<IsCompat>())
- {
- }
- };
- template <ENormalization N>
- struct TShift;
- template <>
- struct TShift<NFD> {
- static const WC_TYPE Value = NFD_QC;
- };
- template <>
- struct TShift<NFC> {
- static const WC_TYPE Value = NFC_QC;
- };
- template <>
- struct TShift<NFKD> {
- static const WC_TYPE Value = NFKD_QC;
- };
- template <>
- struct TShift<NFKC> {
- static const WC_TYPE Value = NFKC_QC;
- };
- template <ENormalization N>
- inline bool Normalized(wchar32 ch) {
- return CharInfo(ch) & NPrivate::TShift<N>::Value;
- }
- class TComposition {
- private:
- struct TRawData {
- wchar32 Lead;
- wchar32 Tail;
- wchar32 Comp;
- };
- static const TRawData RawData[];
- static const size_t RawDataSize;
- class TKey: public std::pair<wchar32, wchar32> {
- public:
- inline TKey(wchar32 a, wchar32 b)
- : std::pair<wchar32, wchar32>(a, b)
- {
- }
- inline size_t Hash() const {
- return CombineHashes(first, second);
- }
- };
- template <class T>
- struct THash {
- inline size_t operator()(const T& t) const {
- return t.Hash();
- }
- };
- typedef THashMap<TKey, wchar32, THash<TKey>> TData;
- TData Data;
- public:
- TComposition();
- inline wchar32 Composite(wchar32 lead, wchar32 tail) const {
- TData::const_iterator i = Data.find(TKey(lead, tail));
- if (i == Data.end())
- return 0;
- return i->second;
- }
- };
- typedef std::pair<wchar32, TCombining> TSymbol;
- typedef TVector<TSymbol> TBuffer;
- template <bool doCompose>
- class TCompositor;
- template <>
- class TCompositor<false> {
- public:
- inline void DoComposition(TBuffer& buffer) {
- Y_UNUSED(buffer);
- }
- };
- template <>
- class TCompositor<true> {
- private:
- static const wchar32 NonComposite = 0;
- const TComposition* Composition;
- public:
- inline TCompositor()
- : Composition(Singleton<TComposition>())
- {
- }
- inline void DoComposition(TBuffer& buffer) {
- if (buffer.size() < 2)
- return;
- const TSymbol& leadSymbol = buffer[0];
- if (leadSymbol.second != 0)
- return;
- wchar32 lead = leadSymbol.first;
- bool oneMoreTurnPlease = false;
- do {
- oneMoreTurnPlease = false;
- TCombining lastCombining = 0;
- for (TBuffer::iterator i = buffer.begin() + 1, mi = buffer.end(); i != mi; ++i) {
- TCombining currentCombining = i->second;
- if (!(currentCombining != lastCombining && currentCombining != 0 || lastCombining == 0 && currentCombining == 0))
- continue;
- lastCombining = currentCombining;
- wchar32 comb = Composition->Composite(lead, i->first);
- if (comb == NonComposite)
- continue;
- lead = comb;
- buffer.erase(i);
- oneMoreTurnPlease = true;
- break;
- }
- } while (oneMoreTurnPlease);
- Y_ASSERT(DecompositionCombining(lead) == 0);
- buffer[0] = TSymbol(lead, 0);
- }
- };
- template <ENormalization N, typename TCharType>
- inline bool Normalized(const TCharType* begin, const TCharType* end) {
- TCombining lastCanonicalClass = 0;
- for (const TCharType* i = begin; i != end;) {
- wchar32 ch = ReadSymbolAndAdvance(i, end);
- TCombining canonicalClass = DecompositionCombining(ch);
- if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
- return false;
- if (!Normalized<N>(ch))
- return false;
- lastCanonicalClass = canonicalClass;
- }
- return true;
- }
- }
- template <bool compat>
- inline const wchar32* Decomposition(wchar32 ch) {
- return NPrivate::Decomposition(NPrivate::DecompositionTable<compat>(), ch);
- }
- template <ENormalization N, class TDecompositor = NPrivate::TDecompositor>
- class TNormalizer : NNonCopyable::TNonCopyable {
- private:
- static const ENormalization Norm = N;
- static const bool IsCompat = Norm == NFKD || Norm == NFKC;
- static const bool RequireComposition = Norm == NFC || Norm == NFKC;
- typedef NPrivate::TSymbol TSymbol;
- typedef NPrivate::TBuffer TBuffer;
- TBuffer Buffer;
- NPrivate::TCompositor<RequireComposition> Compositor;
- const TDecompositor& Decompositor;
- private:
- static inline bool Compare(const TSymbol& a, const TSymbol& b) {
- return a.second < b.second;
- }
- struct TComparer {
- inline bool operator()(const TSymbol& a, const TSymbol& b) {
- return Compare(a, b);
- }
- };
- template <class T>
- static inline void Write(const TBuffer::const_iterator& begin, const TBuffer::const_iterator& end, T& out) {
- for (TBuffer::const_iterator i = begin; i != end; ++i) {
- WriteSymbol(i->first, out);
- }
- }
- static inline void Write(const TBuffer::const_iterator& begin, const TBuffer::const_iterator& end, TUtf32String& out) { // because WriteSymbol from util/charset/wide.h works wrong in this case
- for (TBuffer::const_iterator i = begin; i != end; ++i) {
- out += i->first;
- }
- }
- inline void SortBuffer() {
- if (Buffer.size() < 2)
- return;
- StableSort(Buffer.begin(), Buffer.end(), TComparer());
- }
- template <class T>
- inline void AddCharNoDecomposition(wchar32 c, T& out) {
- TCombining cc = DecompositionCombining(c);
- if (cc == 0) {
- SortBuffer();
- Buffer.push_back(TBuffer::value_type(c, cc));
- Compositor.DoComposition(Buffer);
- if (Buffer.size() > 1) {
- Write(Buffer.begin(), Buffer.end() - 1, out);
- Buffer.erase(Buffer.begin(), Buffer.end() - 1); // TODO I don't like this
- }
- } else {
- Buffer.push_back(TBuffer::value_type(c, cc));
- }
- }
- template <class T>
- inline void AddChar(wchar32 c, T& out) {
- const wchar32* decompBegin = Decompositor.Decomposition(c);
- if (decompBegin) {
- while (*decompBegin) {
- Y_ASSERT(Decompositor.Decomposition(*decompBegin) == nullptr);
- AddCharNoDecomposition(*(decompBegin++), out);
- }
- return;
- } else {
- AddCharNoDecomposition(c, out);
- }
- }
- template <class T, typename TCharType>
- inline void DoNormalize(const TCharType* begin, const TCharType* end, T& out) {
- Buffer.clear();
- for (const TCharType* i = begin; i != end;) {
- AddChar(ReadSymbolAndAdvance(i, end), out);
- }
- SortBuffer();
- Compositor.DoComposition(Buffer);
- Write(Buffer.begin(), Buffer.end(), out);
- }
- public:
- TNormalizer()
- : Decompositor(*Singleton<NPrivate::TStandartDecompositor<IsCompat>>())
- {
- }
- TNormalizer(const TDecompositor& decompositor)
- : Decompositor(decompositor)
- {
- }
- template <class T, typename TCharType>
- inline void Normalize(const TCharType* begin, const TCharType* end, T& out) {
- if (NPrivate::Normalized<Norm>(begin, end)) {
- for (const TCharType* i = begin; i != end; ++i) {
- WriteSymbol(*i, out);
- }
- } else {
- DoNormalize(begin, end, out);
- }
- }
- template <typename TCharType>
- inline void Normalize(const TCharType* begin, const TCharType* end, TUtf32String& out) {
- if (NPrivate::Normalized<Norm>(begin, end)) {
- for (const TCharType* i = begin; i != end;) {
- out += ReadSymbolAndAdvance(i, end);
- }
- } else {
- DoNormalize(begin, end, out);
- }
- }
- template <class T, typename TCharType>
- inline void Normalize(const TCharType* begin, size_t len, T& out) {
- return Normalize(begin, begin + len, out);
- }
- template <typename TCharType>
- inline TBasicString<TCharType> Normalize(const TBasicString<TCharType>& src) {
- if (NPrivate::Normalized<Norm>(src.begin(), src.end())) {
- // nothing to normalize
- return src;
- } else {
- TBasicString<TCharType> res;
- res.reserve(src.length());
- DoNormalize(src.begin(), src.end(), res);
- return res;
- }
- }
- };
- }
- //! decompose utf16 or utf32 string to any container supporting push_back or to T*
- template <NUnicode::ENormalization Norm, class T, typename TCharType>
- inline void Normalize(const TCharType* begin, size_t len, T& out) {
- ::NUnicode::TNormalizer<Norm> dec;
- dec.Normalize(begin, len, out);
- }
- template <NUnicode::ENormalization N, typename TCharType>
- inline TBasicString<TCharType> Normalize(const TCharType* str, size_t len) {
- TBasicString<TCharType> res;
- res.reserve(len);
- Normalize<N>(str, len, res);
- return res;
- }
- template <NUnicode::ENormalization N, typename TCharType>
- inline TBasicString<TCharType> Normalize(const TBasicString<TCharType>& str) {
- ::NUnicode::TNormalizer<N> dec;
- return dec.Normalize(str);
- }
- template <NUnicode::ENormalization N, typename TCharType>
- inline TBasicString<TCharType> Normalize(const TBasicStringBuf<TCharType> str) {
- return Normalize<N>(str.data(), str.size());
- }
|