#pragma once #include "decomposition_table.h" #include #include #include #include #include #include #include #include namespace NUnicode { enum ENormalization { NFD, NFC, NFKD, NFKC, }; // Грубо говоря: // NFD расскладывает "ё" на "е + диакритику" // NFC сначала всё раскладывает, потом всё что может - складывает // NFKD делает то же, что и NFD. Кроме того, например, римскую IV (\x2163) // превращает в латинские I и V // NFKC - NFKD + композиция (римская четвёрка из I и V, естественно, не образуется) // Формальная спецификация: http://www.unicode.org/reports/tr15/ namespace NPrivate { inline const wchar32* Decomposition(const TDecompositionTable& table, wchar32 ch) { return table.Get(ch, static_cast(nullptr)); } class TDecompositor { private: const TDecompositionTable& Table; public: inline TDecompositor(const TDecompositionTable& table) : Table(table) { } inline const wchar32* Decomposition(wchar32 ch) const { return NPrivate::Decomposition(Table, ch); } }; template struct TStandartDecompositor: public TDecompositor { TStandartDecompositor() : TDecompositor(NPrivate::DecompositionTable()) { } }; template struct TShift; template <> struct TShift { static const WC_TYPE Value = NFD_QC; }; template <> struct TShift { static const WC_TYPE Value = NFC_QC; }; template <> struct TShift { static const WC_TYPE Value = NFKD_QC; }; template <> struct TShift { static const WC_TYPE Value = NFKC_QC; }; template inline bool Normalized(wchar32 ch) { return CharInfo(ch) & NPrivate::TShift::Value; } class TComposition { private: struct TRawData { wchar32 Lead; wchar32 Tail; wchar32 Comp; }; static const TRawData RawData[]; static const size_t RawDataSize; class TKey: public std::pair { public: inline TKey(wchar32 a, wchar32 b) : std::pair(a, b) { } inline size_t Hash() const { return CombineHashes(first, second); } }; template struct THash { inline size_t operator()(const T& t) const { return t.Hash(); } }; typedef THashMap> TData; TData Data; public: TComposition(); inline wchar32 Composite(wchar32 lead, wchar32 tail) const { TData::const_iterator i = Data.find(TKey(lead, tail)); if (i == Data.end()) return 0; return i->second; } }; typedef std::pair TSymbol; typedef TVector TBuffer; template class TCompositor; template <> class TCompositor { public: inline void DoComposition(TBuffer& buffer) { Y_UNUSED(buffer); } }; template <> class TCompositor { private: static const wchar32 NonComposite = 0; const TComposition* Composition; public: inline TCompositor() : Composition(Singleton()) { } inline void DoComposition(TBuffer& buffer) { if (buffer.size() < 2) return; const TSymbol& leadSymbol = buffer[0]; if (leadSymbol.second != 0) return; wchar32 lead = leadSymbol.first; bool oneMoreTurnPlease = false; do { oneMoreTurnPlease = false; TCombining lastCombining = 0; for (TBuffer::iterator i = buffer.begin() + 1, mi = buffer.end(); i != mi; ++i) { TCombining currentCombining = i->second; if (!(currentCombining != lastCombining && currentCombining != 0 || lastCombining == 0 && currentCombining == 0)) continue; lastCombining = currentCombining; wchar32 comb = Composition->Composite(lead, i->first); if (comb == NonComposite) continue; lead = comb; buffer.erase(i); oneMoreTurnPlease = true; break; } } while (oneMoreTurnPlease); Y_ASSERT(DecompositionCombining(lead) == 0); buffer[0] = TSymbol(lead, 0); } }; template inline bool Normalized(const TCharType* begin, const TCharType* end) { TCombining lastCanonicalClass = 0; for (const TCharType* i = begin; i != end;) { wchar32 ch = ReadSymbolAndAdvance(i, end); TCombining canonicalClass = DecompositionCombining(ch); if (lastCanonicalClass > canonicalClass && canonicalClass != 0) return false; if (!Normalized(ch)) return false; lastCanonicalClass = canonicalClass; } return true; } } template inline const wchar32* Decomposition(wchar32 ch) { return NPrivate::Decomposition(NPrivate::DecompositionTable(), ch); } template class TNormalizer : NNonCopyable::TNonCopyable { private: static const ENormalization Norm = N; static const bool IsCompat = Norm == NFKD || Norm == NFKC; static const bool RequireComposition = Norm == NFC || Norm == NFKC; typedef NPrivate::TSymbol TSymbol; typedef NPrivate::TBuffer TBuffer; TBuffer Buffer; NPrivate::TCompositor Compositor; const TDecompositor& Decompositor; private: static inline bool Compare(const TSymbol& a, const TSymbol& b) { return a.second < b.second; } struct TComparer { inline bool operator()(const TSymbol& a, const TSymbol& b) { return Compare(a, b); } }; template static inline void Write(const TBuffer::const_iterator& begin, const TBuffer::const_iterator& end, T& out) { for (TBuffer::const_iterator i = begin; i != end; ++i) { WriteSymbol(i->first, out); } } static inline void Write(const TBuffer::const_iterator& begin, const TBuffer::const_iterator& end, TUtf32String& out) { // because WriteSymbol from util/charset/wide.h works wrong in this case for (TBuffer::const_iterator i = begin; i != end; ++i) { out += i->first; } } inline void SortBuffer() { if (Buffer.size() < 2) return; StableSort(Buffer.begin(), Buffer.end(), TComparer()); } template inline void AddCharNoDecomposition(wchar32 c, T& out) { TCombining cc = DecompositionCombining(c); if (cc == 0) { SortBuffer(); Buffer.push_back(TBuffer::value_type(c, cc)); Compositor.DoComposition(Buffer); if (Buffer.size() > 1) { Write(Buffer.begin(), Buffer.end() - 1, out); Buffer.erase(Buffer.begin(), Buffer.end() - 1); // TODO I don't like this } } else { Buffer.push_back(TBuffer::value_type(c, cc)); } } template inline void AddChar(wchar32 c, T& out) { const wchar32* decompBegin = Decompositor.Decomposition(c); if (decompBegin) { while (*decompBegin) { Y_ASSERT(Decompositor.Decomposition(*decompBegin) == nullptr); AddCharNoDecomposition(*(decompBegin++), out); } return; } else { AddCharNoDecomposition(c, out); } } template inline void DoNormalize(const TCharType* begin, const TCharType* end, T& out) { Buffer.clear(); for (const TCharType* i = begin; i != end;) { AddChar(ReadSymbolAndAdvance(i, end), out); } SortBuffer(); Compositor.DoComposition(Buffer); Write(Buffer.begin(), Buffer.end(), out); } public: TNormalizer() : Decompositor(*Singleton>()) { } TNormalizer(const TDecompositor& decompositor) : Decompositor(decompositor) { } template inline void Normalize(const TCharType* begin, const TCharType* end, T& out) { if (NPrivate::Normalized(begin, end)) { for (const TCharType* i = begin; i != end; ++i) { WriteSymbol(*i, out); } } else { DoNormalize(begin, end, out); } } template inline void Normalize(const TCharType* begin, const TCharType* end, TUtf32String& out) { if (NPrivate::Normalized(begin, end)) { for (const TCharType* i = begin; i != end;) { out += ReadSymbolAndAdvance(i, end); } } else { DoNormalize(begin, end, out); } } template inline void Normalize(const TCharType* begin, size_t len, T& out) { return Normalize(begin, begin + len, out); } template inline TBasicString Normalize(const TBasicString& src) { if (NPrivate::Normalized(src.begin(), src.end())) { // nothing to normalize return src; } else { TBasicString res; res.reserve(src.length()); DoNormalize(src.begin(), src.end(), res); return res; } } }; } //! decompose utf16 or utf32 string to any container supporting push_back or to T* template inline void Normalize(const TCharType* begin, size_t len, T& out) { ::NUnicode::TNormalizer dec; dec.Normalize(begin, len, out); } template inline TBasicString Normalize(const TCharType* str, size_t len) { TBasicString res; res.reserve(len); Normalize(str, len, res); return res; } template inline TBasicString Normalize(const TBasicString& str) { ::NUnicode::TNormalizer dec; return dec.Normalize(str); } template inline TBasicString Normalize(const TBasicStringBuf str) { return Normalize(str.data(), str.size()); }