Просмотр исходного кода

Restoring authorship annotation for <grig@yandex-team.ru>. Commit 2 of 2.

grig 3 года назад
Родитель
Коммит
beb63ece3a

+ 105 - 105
library/cpp/charset/codepage.cpp

@@ -264,10 +264,10 @@ void DoDecodeUnknownPlane(TxChar* str, TxChar*& ee, const ECharset enc) {
         TxChar* d;
 
         for (s = d = str; s < e;) {
-            size_t l = 0; 
+            size_t l = 0;
 
-            if (utf8_read_rune_from_unknown_plane(*d, l, s, e) == RECODE_OK) { 
-                d++, s += l; 
+            if (utf8_read_rune_from_unknown_plane(*d, l, s, e) == RECODE_OK) {
+                d++, s += l;
             } else {
                 *d++ = BROKEN_RUNE;
                 ++s;
@@ -317,7 +317,7 @@ void DecodeUnknownPlane(wchar16* str, wchar16*& ee, const ECharset enc) {
 void DecodeUnknownPlane(wchar32* str, wchar32*& ee, const ECharset enc) {
     DoDecodeUnknownPlane(str, ee, enc);
 }
- 
+
 namespace {
     class THashSetType: public THashSet<TString> {
     public:
@@ -332,37 +332,37 @@ namespace {
 }
 
 class TWindowsPrefixesHashSet: public THashSetType {
-public: 
-    inline TWindowsPrefixesHashSet() { 
+public:
+    inline TWindowsPrefixesHashSet() {
         Add("win");
         Add("wincp");
         Add("window");
-        Add("windowcp"); 
+        Add("windowcp");
         Add("windows");
         Add("windowscp");
-        Add("ansi"); 
-        Add("ansicp"); 
-    } 
-}; 
- 
+        Add("ansi");
+        Add("ansicp");
+    }
+};
+
 class TCpPrefixesHashSet: public THashSetType {
-public: 
-    inline TCpPrefixesHashSet() { 
-        Add("microsoft"); 
-        Add("microsoftcp"); 
-        Add("cp"); 
-    } 
-}; 
- 
+public:
+    inline TCpPrefixesHashSet() {
+        Add("microsoft");
+        Add("microsoftcp");
+        Add("cp");
+    }
+};
+
 class TIsoPrefixesHashSet: public THashSetType {
-public: 
-    inline TIsoPrefixesHashSet() { 
-        Add("iso"); 
-        Add("isolatin"); 
-        Add("latin"); 
-    } 
-}; 
- 
+public:
+    inline TIsoPrefixesHashSet() {
+        Add("iso");
+        Add("isolatin");
+        Add("latin");
+    }
+};
+
 class TLatinToIsoHash: public THashMap<const char*, TString, ci_hash, ci_equal_to> {
 public:
     inline TLatinToIsoHash() {
@@ -377,57 +377,57 @@ public:
         insert(value_type("latin9", "iso-8859-15"));
         insert(value_type("latin10", "iso-8859-16"));
     }
-}; 
- 
+};
+
 static inline void NormalizeEncodingPrefixes(TString& enc) {
-    size_t preflen = enc.find_first_of("0123456789"); 
+    size_t preflen = enc.find_first_of("0123456789");
     if (preflen == TString::npos)
-        return; 
- 
+        return;
+
     TString prefix = enc.substr(0, preflen);
-    for (size_t i = 0; i < prefix.length(); ++i) { 
-        if (prefix[i] == '-') { 
-            prefix.remove(i--); 
-        } 
-    } 
- 
+    for (size_t i = 0; i < prefix.length(); ++i) {
+        if (prefix[i] == '-') {
+            prefix.remove(i--);
+        }
+    }
+
     if (Singleton<TWindowsPrefixesHashSet>()->Has(prefix)) {
-        enc.remove(0, preflen); 
-        enc.prepend("windows-"); 
-        return; 
-    } 
- 
+        enc.remove(0, preflen);
+        enc.prepend("windows-");
+        return;
+    }
+
     if (Singleton<TCpPrefixesHashSet>()->Has(prefix)) {
         if (enc.length() > preflen + 3 && !strncmp(enc.c_str() + preflen, "125", 3) && isdigit(enc[preflen + 3])) {
-            enc.remove(0, preflen); 
-            enc.prepend("windows-"); 
-            return; 
-        } 
-        enc.remove(0, preflen); 
-        enc.prepend("cp"); 
-        return; 
-    } 
- 
+            enc.remove(0, preflen);
+            enc.prepend("windows-");
+            return;
+        }
+        enc.remove(0, preflen);
+        enc.prepend("cp");
+        return;
+    }
+
     if (Singleton<TIsoPrefixesHashSet>()->Has(prefix)) {
-        if (enc.length() == preflen + 1 || enc.length() == preflen + 2) { 
+        if (enc.length() == preflen + 1 || enc.length() == preflen + 2) {
             TString enccopy = enc.substr(preflen);
-            enccopy.prepend("latin"); 
-            const TLatinToIsoHash* latinhash = Singleton<TLatinToIsoHash>(); 
+            enccopy.prepend("latin");
+            const TLatinToIsoHash* latinhash = Singleton<TLatinToIsoHash>();
             TLatinToIsoHash::const_iterator it = latinhash->find(enccopy.data());
-            if (it != latinhash->end()) 
-                enc.assign(it->second); 
-            return; 
-        } else if (enc.length() > preflen + 5 && enc[preflen] == '8') { 
-            enc.remove(0, preflen); 
-            enc.prepend("iso-"); 
-            return; 
-        } 
-    } 
-} 
- 
+            if (it != latinhash->end())
+                enc.assign(it->second);
+            return;
+        } else if (enc.length() > preflen + 5 && enc[preflen] == '8') {
+            enc.remove(0, preflen);
+            enc.prepend("iso-");
+            return;
+        }
+    }
+}
+
 class TEncodingNamesHashSet: public THashSetType {
-public: 
-    TEncodingNamesHashSet() { 
+public:
+    TEncodingNamesHashSet() {
         Add("iso-8859-1");
         Add("iso-8859-2");
         Add("iso-8859-3");
@@ -468,44 +468,44 @@ public:
         Add("big5");
         Add("tis-620");
         Add("tis620");
-    } 
-}; 
- 
+    }
+};
+
 ECharset EncodingHintByName(const char* encname) {
-    if (!encname) 
-        return CODES_UNKNOWN; // safety check 
- 
-    // Common trouble: spurious "charset=" in the encoding name 
-    if (!strnicmp(encname, "charset=", 8)) { 
-        encname += 8; 
-    } 
- 
-    // Strip everything up to the first alphanumeric, and after the last one 
-    while (*encname && !isalnum(*encname)) 
-        ++encname; 
- 
-    if (!*encname) 
-        return CODES_UNKNOWN; 
- 
-    const char* lastpos = encname + strlen(encname) - 1; 
-    while (lastpos > encname && !isalnum(*lastpos)) 
-        --lastpos; 
- 
-    // Do some normalization 
+    if (!encname)
+        return CODES_UNKNOWN; // safety check
+
+    // Common trouble: spurious "charset=" in the encoding name
+    if (!strnicmp(encname, "charset=", 8)) {
+        encname += 8;
+    }
+
+    // Strip everything up to the first alphanumeric, and after the last one
+    while (*encname && !isalnum(*encname))
+        ++encname;
+
+    if (!*encname)
+        return CODES_UNKNOWN;
+
+    const char* lastpos = encname + strlen(encname) - 1;
+    while (lastpos > encname && !isalnum(*lastpos))
+        --lastpos;
+
+    // Do some normalization
     TString enc(encname, lastpos - encname + 1);
-    enc.to_lower(); 
+    enc.to_lower();
     for (char* p = enc.begin(); p != enc.end(); ++p) {
-        if (*p == ' ' || *p == '=' || *p == '_') 
-            *p = '-'; 
-    } 
- 
-    NormalizeEncodingPrefixes(enc); 
- 
+        if (*p == ' ' || *p == '=' || *p == '_')
+            *p = '-';
+    }
+
+    NormalizeEncodingPrefixes(enc);
+
     ECharset hint = CharsetByName(enc.c_str());
-    if (hint != CODES_UNKNOWN) 
-        return hint; 
- 
+    if (hint != CODES_UNKNOWN)
+        return hint;
+
     if (Singleton<TEncodingNamesHashSet>()->Has(enc))
-        return CODES_UNSUPPORTED; 
-    return CODES_UNKNOWN; 
-} 
+        return CODES_UNSUPPORTED;
+    return CODES_UNKNOWN;
+}

+ 102 - 102
library/cpp/charset/codepage_ut.cpp

@@ -6,7 +6,7 @@
 
 #include <util/charset/utf8.h>
 #include <util/system/yassert.h>
- 
+
 #if defined(_MSC_VER)
 #pragma warning(disable : 4309) /*truncation of constant value*/
 #endif
@@ -30,8 +30,8 @@ namespace {
 }
 
 class TCodepageTest: public TTestBase {
-private: 
-    UNIT_TEST_SUITE(TCodepageTest); 
+private:
+    UNIT_TEST_SUITE(TCodepageTest);
     UNIT_TEST(TestUTF);
     UNIT_TEST(TestUTFFromUnknownPlane);
     UNIT_TEST(TestBrokenMultibyte);
@@ -42,14 +42,14 @@ private:
     UNIT_TEST(TestUpperLower);
     UNIT_TEST(TestBrokenRune);
     UNIT_TEST(TestCanEncode);
-    UNIT_TEST_SUITE_END(); 
- 
-public: 
-    void TestUTF(); 
-    void TestUTFFromUnknownPlane(); 
+    UNIT_TEST_SUITE_END();
+
+public:
+    void TestUTF();
+    void TestUTFFromUnknownPlane();
     void TestBrokenMultibyte();
     void TestSurrogatePairs();
-    void TestEncodingHints(); 
+    void TestEncodingHints();
     void TestToLower();
     void TestToUpper();
 
@@ -71,55 +71,55 @@ public:
     void TestBrokenRune() {
         UNIT_ASSERT_VALUES_EQUAL(BROKEN_RUNE, 0xFFFDu);
     }
-}; 
- 
-UNIT_TEST_SUITE_REGISTRATION(TCodepageTest); 
- 
-void TCodepageTest::TestUTF() { 
+};
+
+UNIT_TEST_SUITE_REGISTRATION(TCodepageTest);
+
+void TCodepageTest::TestUTF() {
     for (wchar32 i = 0; i <= 0x10FFFF; i++) {
-        unsigned char buffer[32]; 
+        unsigned char buffer[32];
         Zero(buffer);
-        size_t rune_len; 
-        size_t ref_len = 0; 
- 
-        if (i < 0x80) 
-            ref_len = 1; 
-        else if (i < 0x800) 
-            ref_len = 2; 
-        else if (i < 0x10000) 
-            ref_len = 3; 
-        else 
-            ref_len = 4; 
- 
+        size_t rune_len;
+        size_t ref_len = 0;
+
+        if (i < 0x80)
+            ref_len = 1;
+        else if (i < 0x800)
+            ref_len = 2;
+        else if (i < 0x10000)
+            ref_len = 3;
+        else
+            ref_len = 4;
+
         RECODE_RESULT res = SafeWriteUTF8Char(i, rune_len, buffer, buffer + 32);
-        UNIT_ASSERT(res == RECODE_OK); 
-        UNIT_ASSERT(rune_len == ref_len); 
- 
+        UNIT_ASSERT(res == RECODE_OK);
+        UNIT_ASSERT(rune_len == ref_len);
+
         res = SafeWriteUTF8Char(i, rune_len, buffer, buffer + ref_len - 1);
-        UNIT_ASSERT(res == RECODE_EOOUTPUT); 
- 
-        wchar32 rune; 
+        UNIT_ASSERT(res == RECODE_EOOUTPUT);
+
+        wchar32 rune;
         res = SafeReadUTF8Char(rune, rune_len, buffer, buffer + 32);
-        UNIT_ASSERT(res == RECODE_OK); 
-        UNIT_ASSERT(rune == i); 
-        UNIT_ASSERT(rune_len == ref_len); 
- 
+        UNIT_ASSERT(res == RECODE_OK);
+        UNIT_ASSERT(rune == i);
+        UNIT_ASSERT(rune_len == ref_len);
+
         res = SafeReadUTF8Char(rune, rune_len, buffer, buffer + ref_len - 1);
-        UNIT_ASSERT(res == RECODE_EOINPUT); 
- 
-        if (ref_len > 1) { 
+        UNIT_ASSERT(res == RECODE_EOINPUT);
+
+        if (ref_len > 1) {
             res = SafeReadUTF8Char(rune, rune_len, buffer + 1, buffer + ref_len);
-            UNIT_ASSERT(res == RECODE_BROKENSYMBOL); 
- 
-            buffer[1] |= 0xC0; 
+            UNIT_ASSERT(res == RECODE_BROKENSYMBOL);
+
+            buffer[1] |= 0xC0;
             res = SafeReadUTF8Char(rune, rune_len, buffer, buffer + ref_len);
-            UNIT_ASSERT(res == RECODE_BROKENSYMBOL); 
- 
-            buffer[1] &= 0x3F; 
+            UNIT_ASSERT(res == RECODE_BROKENSYMBOL);
+
+            buffer[1] &= 0x3F;
             res = SafeReadUTF8Char(rune, rune_len, buffer, buffer + ref_len);
-            UNIT_ASSERT(res == RECODE_BROKENSYMBOL); 
-        } 
-    } 
+            UNIT_ASSERT(res == RECODE_BROKENSYMBOL);
+        }
+    }
     const char* badStrings[] = {
         "\xfe",
         "\xff",
@@ -158,8 +158,8 @@ void TCodepageTest::TestUTF() {
         RECODE_RESULT res = SafeReadUTF8Char(rune, len, p, p + strlen(badStrings[i]));
         UNIT_ASSERT(res == RECODE_BROKENSYMBOL);
     }
-} 
- 
+}
+
 void TCodepageTest::TestBrokenMultibyte() {
     const ECharset cp = CODES_EUC_JP;
 
@@ -180,24 +180,24 @@ void TCodepageTest::TestBrokenMultibyte() {
     UNIT_ASSERT(nread == Y_ARRAY_SIZE(bigSample));
 }
 
-void TCodepageTest::TestUTFFromUnknownPlane() { 
+void TCodepageTest::TestUTFFromUnknownPlane() {
     static const wchar32 sampletext[] = {0x61, 0x62, 0x63, 0x20,
                                          0x430, 0x431, 0x432, 0x20,
                                          0x1001, 0x1002, 0x1003, 0x20,
                                          0x10001, 0x10002, 0x10003};
- 
-    static const size_t BUFFER_SIZE = 1024; 
-    char bytebuffer[BUFFER_SIZE]; 
- 
-    size_t readchars = 0; 
-    size_t writtenbytes = 0; 
+
+    static const size_t BUFFER_SIZE = 1024;
+    char bytebuffer[BUFFER_SIZE];
+
+    size_t readchars = 0;
+    size_t writtenbytes = 0;
     size_t samplelen = Y_ARRAY_SIZE(sampletext);
 
     RECODE_RESULT res = RecodeFromUnicode(CODES_UTF8, sampletext, bytebuffer, samplelen, BUFFER_SIZE, readchars, writtenbytes);
- 
+
     UNIT_ASSERT(res == RECODE_OK);
     UNIT_ASSERT(samplelen == readchars);
- 
+
     size_t writtenbytes2 = 0;
     char bytebuffer2[BUFFER_SIZE];
     for (size_t i = 0; i != samplelen; ++i) {
@@ -209,45 +209,45 @@ void TCodepageTest::TestUTFFromUnknownPlane() {
     }
     UNIT_ASSERT_VALUES_EQUAL(TStringBuf(bytebuffer, writtenbytes), TStringBuf(bytebuffer2, writtenbytes2));
 
-    wchar32 charbuffer[BUFFER_SIZE]; 
-    size_t readbytes = 0; 
-    size_t writtenchars = 0; 
- 
+    wchar32 charbuffer[BUFFER_SIZE];
+    size_t readbytes = 0;
+    size_t writtenchars = 0;
+
     res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer, writtenbytes, BUFFER_SIZE, readbytes, writtenchars);
- 
+
     UNIT_ASSERT(res == RECODE_OK);
     UNIT_ASSERT(readbytes == writtenbytes);
- 
-    wchar32* charbufferend = charbuffer + writtenchars; 
+
+    wchar32* charbufferend = charbuffer + writtenchars;
     DecodeUnknownPlane(charbuffer, charbufferend, CODES_UTF8);
- 
+
     UNIT_ASSERT(charbufferend == charbuffer + samplelen);
     for (size_t i = 0; i < samplelen; ++i)
         UNIT_ASSERT(sampletext[i] == charbuffer[i]);
- 
-    // Now, concatenate the thing with an explicit character and retest 
+
+    // Now, concatenate the thing with an explicit character and retest
     res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer, writtenbytes, BUFFER_SIZE, readbytes, writtenchars);
     UNIT_ASSERT(res == RECODE_OK);
     UNIT_ASSERT(readbytes == writtenbytes);
- 
-    charbuffer[writtenchars] = 0x1234; 
- 
-    size_t morewrittenchars = 0; 
+
+    charbuffer[writtenchars] = 0x1234;
+
+    size_t morewrittenchars = 0;
     res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer + writtenchars + 1, writtenbytes, BUFFER_SIZE, readbytes, morewrittenchars);
     UNIT_ASSERT(res == RECODE_OK);
     UNIT_ASSERT(readbytes == writtenbytes);
     UNIT_ASSERT(writtenchars == morewrittenchars);
- 
-    charbuffer[2 * writtenchars + 1] = 0x5678; 
- 
-    charbufferend = charbuffer + 2 * writtenchars + 2; 
+
+    charbuffer[2 * writtenchars + 1] = 0x5678;
+
+    charbufferend = charbuffer + 2 * writtenchars + 2;
     DecodeUnknownPlane(charbuffer, charbufferend, CODES_UTF8);
- 
+
     UNIT_ASSERT(charbufferend == charbuffer + 2 * samplelen + 2);
-    for (size_t i = 0; i < samplelen; ++i) { 
+    for (size_t i = 0; i < samplelen; ++i) {
         UNIT_ASSERT(sampletext[i] == charbuffer[i]);
         UNIT_ASSERT(sampletext[i] == charbuffer[samplelen + 1 + i]);
-    } 
+    }
     UNIT_ASSERT(0x1234 == charbuffer[samplelen]);
     UNIT_ASSERT(0x5678 == charbuffer[2 * samplelen + 1]);
 
@@ -265,8 +265,8 @@ void TCodepageTest::TestUTFFromUnknownPlane() {
             UNIT_ASSERT_VALUES_EQUAL(wtr[i], sampletext[i]);
         }
     }
-} 
- 
+}
+
 static void TestSurrogates(const char* str, const wchar16* wide, size_t wideSize) {
     size_t sSize = strlen(str);
     size_t wSize = sSize * 2;
@@ -298,24 +298,24 @@ void TCodepageTest::TestSurrogatePairs() {
     TestSurrogates(utf8NonBMP2, wNonBMPDummy2, Y_ARRAY_SIZE(wNonBMPDummy2));
 }
 
-void TCodepageTest::TestEncodingHints() { 
-    UNIT_ASSERT(CODES_WIN == EncodingHintByName("windows-1251")); 
-    UNIT_ASSERT(CODES_WIN == EncodingHintByName("Windows1251")); 
-    UNIT_ASSERT(CODES_WIN == EncodingHintByName("WIN1251")); 
-    UNIT_ASSERT(CODES_WIN == EncodingHintByName("window-cp1251")); 
-    UNIT_ASSERT(CODES_WIN == EncodingHintByName("!!!CP1251???")); 
-    UNIT_ASSERT(CODES_WIN == EncodingHintByName("'ansi-cp1251;'")); 
-    UNIT_ASSERT(CODES_WIN == EncodingHintByName("charset=Microsoft-CP1251;")); 
- 
-    UNIT_ASSERT(CODES_ISO_EAST == EncodingHintByName("iso-8859-2")); 
-    UNIT_ASSERT(CODES_ISO_EAST == EncodingHintByName("iso-2")); 
-    UNIT_ASSERT(CODES_ISO_EAST == EncodingHintByName("iso-latin-2")); 
-    UNIT_ASSERT(CODES_ISO_EAST == EncodingHintByName("charset=\"Latin2\";")); 
- 
-    UNIT_ASSERT(CODES_UNKNOWN == EncodingHintByName("widow1251")); 
-    UNIT_ASSERT(CODES_UNKNOWN == EncodingHintByName("default")); 
-    UNIT_ASSERT(CODES_UNKNOWN == EncodingHintByName("$phpcharset")); 
- 
+void TCodepageTest::TestEncodingHints() {
+    UNIT_ASSERT(CODES_WIN == EncodingHintByName("windows-1251"));
+    UNIT_ASSERT(CODES_WIN == EncodingHintByName("Windows1251"));
+    UNIT_ASSERT(CODES_WIN == EncodingHintByName("WIN1251"));
+    UNIT_ASSERT(CODES_WIN == EncodingHintByName("window-cp1251"));
+    UNIT_ASSERT(CODES_WIN == EncodingHintByName("!!!CP1251???"));
+    UNIT_ASSERT(CODES_WIN == EncodingHintByName("'ansi-cp1251;'"));
+    UNIT_ASSERT(CODES_WIN == EncodingHintByName("charset=Microsoft-CP1251;"));
+
+    UNIT_ASSERT(CODES_ISO_EAST == EncodingHintByName("iso-8859-2"));
+    UNIT_ASSERT(CODES_ISO_EAST == EncodingHintByName("iso-2"));
+    UNIT_ASSERT(CODES_ISO_EAST == EncodingHintByName("iso-latin-2"));
+    UNIT_ASSERT(CODES_ISO_EAST == EncodingHintByName("charset=\"Latin2\";"));
+
+    UNIT_ASSERT(CODES_UNKNOWN == EncodingHintByName("widow1251"));
+    UNIT_ASSERT(CODES_UNKNOWN == EncodingHintByName("default"));
+    UNIT_ASSERT(CODES_UNKNOWN == EncodingHintByName("$phpcharset"));
+
     UNIT_ASSERT(CODES_UNSUPPORTED != EncodingHintByName("ShiftJIS"));
     UNIT_ASSERT(CODES_UNSUPPORTED != EncodingHintByName("Shift_JIS"));
     UNIT_ASSERT(CODES_UNSUPPORTED != EncodingHintByName("Big5"));
@@ -324,7 +324,7 @@ void TCodepageTest::TestEncodingHints() {
     UNIT_ASSERT(CODES_UNSUPPORTED != EncodingHintByName("charset='Shift_JIS';;"));
     UNIT_ASSERT(CODES_UNSUPPORTED != EncodingHintByName("ISO-2022-KR"));
     UNIT_ASSERT(CODES_UNSUPPORTED != EncodingHintByName("ISO-2022-jp"));
-} 
+}
 
 void TCodepageTest::TestToLower() {
     TTempBuf buf;

+ 1 - 1
library/cpp/charset/doccodes.h

@@ -1,7 +1,7 @@
 #pragma once
 
 enum ECharset {
-    CODES_UNSUPPORTED = -2, // valid but unsupported encoding 
+    CODES_UNSUPPORTED = -2, // valid but unsupported encoding
     CODES_UNKNOWN = -1,     // invalid or unspecified encoding
     CODES_WIN,              // [ 0] WINDOWS_1251     Windows
     CODES_KOI8,             // [ 1] KOI8_U           Koi8-u

+ 14 - 14
library/cpp/charset/generated/cp_data.cpp

@@ -1,7 +1,7 @@
 #include <library/cpp/charset/codepage.h>
- 
-extern const char defchars[][DEFCHAR_BUF]; 
- 
+
+extern const char defchars[][DEFCHAR_BUF];
+
 static const CodePage CODES_ALT_CODE_PAGE = {
     CODES_ALT,
     {"IBM866", "csIBM866", "cp866", "866", "dos-866", "alt", "windows-866",},
@@ -3244,9 +3244,9 @@ static const CodePage CODES_TIS620_CODE_PAGE = {
 }; // generated from tis620.txt
 
 static const CodePage CODES_UNKNOWNPLANE_CODE_PAGE = {
-    CODES_UNKNOWNPLANE, 
+    CODES_UNKNOWNPLANE,
     {"unknownplane", "unknown-plane",},
-    { 
+    {
         0x0000, 0xF001, 0xF002, 0xF003, 0xF004, 0xF005, 0xF006, 0xF007,
         0xF008, 0xF009, 0xF00A, 0xF00B, 0xF00C, 0xF00D, 0xF00E, 0xF00F,
         0xF010, 0xF011, 0xF012, 0xF013, 0xF014, 0xF015, 0xF016, 0xF017,
@@ -3279,10 +3279,10 @@ static const CodePage CODES_UNKNOWNPLANE_CODE_PAGE = {
         0xF0E8, 0xF0E9, 0xF0EA, 0xF0EB, 0xF0EC, 0xF0ED, 0xF0EE, 0xF0EF,
         0xF0F0, 0xF0F1, 0xF0F2, 0xF0F3, 0xF0F4, 0xF0F5, 0xF0F6, 0xF0F7,
         0xF0F8, 0xF0F9, 0xF0FA, 0xF0FB, 0xF0FC, 0xF0FD, 0xF0FE, 0xF0FF,
-    }, 
-    defchars[0], 
+    },
+    defchars[0],
 }; // generated from unknown.txt
- 
+
 static const CodePage CODES_UTF8_CODE_PAGE = {
     CODES_UTF8,
     {"utf-8",},
@@ -3307,7 +3307,7 @@ static const CodePage CODES_UTF_16LE_CODE_PAGE = {
 static const CodePage CODES_VISCII_CODE_PAGE = {
     CODES_VISCII,
     {"VISCII", "VISCII1.1-1", "CSVISCII",},
-    { 
+    {
         0x0000, 0x0001, 0x1EB2, 0x0003, 0x0004, 0x1EB4, 0x1EAA, 0x0007,
         0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,
         0x0010, 0x0011, 0x0012, 0x0013, 0x1EF6, 0x0015, 0x0016, 0x0017,
@@ -3660,15 +3660,15 @@ static const CodePage CODES_YANDEX_CODE_PAGE = {
         0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F,
         0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
         0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F,
-    }, 
+    },
     defchars[1],
 }; // generated from yandex.txt
- 
+
 const char defchars[][DEFCHAR_BUF] = {
     {"\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077\077"}, // generated from ascii_dc.txt
     {"\077\xA6\xB6\xA6\055\xB6\x9F\x9F\x9F\x9F\x9F\x9F\x9F\200\200\200\071\130\077\071\040\040\n\n\x1A\x1A\x1A\x1A\x1A\x1A\x1A\077\077\055\055\050\042\051\042\042\042\137\052\042\056\055\055\075\055\044\140\xB0\047\047\047\047\047"}, // generated from yandex_dc.txt
-}; 
- 
+};
+
 const CodePage* const NCodepagePrivate::TCodePageData::AllCodePages[] = {
     &CODES_ALT_CODE_PAGE,
     &CODES_ARMSCII_CODE_PAGE,
@@ -3783,6 +3783,6 @@ const CodePage* const NCodepagePrivate::TCodePageData::AllCodePages[] = {
     &CODES_WINDOWS_1257_CODE_PAGE,
     &CODES_WIN_EAST_CODE_PAGE,
     &CODES_YANDEX_CODE_PAGE,
-}; 
+};
 
 const CodePage& csYandex = CODES_YANDEX_CODE_PAGE;

Разница между файлами не показана из-за своего большого размера
+ 408 - 408
library/cpp/charset/generated/encrec_data.cpp


+ 1 - 1
library/cpp/containers/comptrie/comptrie.h

@@ -1,4 +1,4 @@
 #pragma once
- 
+
 #include "comptrie_trie.h"
 #include "comptrie_builder.h"

+ 10 - 10
library/cpp/containers/comptrie/comptrie_builder.h

@@ -1,5 +1,5 @@
 #pragma once
- 
+
 #include "comptrie_packer.h"
 #include "minimize.h"
 #include "key_selector.h"
@@ -12,7 +12,7 @@
 // is created incrementally. It actually helps a lot to have the input data prefix-grouped
 // by key; otherwise, memory consumption becomes a tough issue.
 // NOTE: building and serializing the automaton may be lengthy, and takes lots of memory.
- 
+
 // PREFIX_GROUPED means that if we, while constructing a trie, add to the builder two keys with the same prefix,
 // then all the keys that we add between these two also have the same prefix.
 // Actually in this mode the builder can accept even more freely ordered input,
@@ -45,16 +45,16 @@ public:
     typedef S TPacker;
     typedef typename TCompactTrieKeySelector<TSymbol>::TKey TKey;
     typedef typename TCompactTrieKeySelector<TSymbol>::TKeyBuf TKeyBuf;
- 
+
     explicit TCompactTrieBuilder(TCompactTrieBuilderFlags flags = CTBF_NONE, TPacker packer = TPacker(), IAllocator* alloc = TDefaultAllocator::Instance());
- 
+
     // All Add.. methods return true if it was a new key, false if the key already existed.
 
     bool Add(const TSymbol* key, size_t keylen, const TData& value);
     bool Add(const TKeyBuf& key, const TData& value) {
         return Add(key.data(), key.size(), value);
     }
- 
+
     // add already serialized data
     bool AddPtr(const TSymbol* key, size_t keylen, const char* data);
     bool AddPtr(const TKeyBuf& key, const char* data) {
@@ -80,14 +80,14 @@ public:
     bool FindLongestPrefix(const TKeyBuf& key, size_t* prefixLen, TData* value = nullptr) const {
         return FindLongestPrefix(key.data(), key.size(), prefixLen, value);
     }
- 
+
     size_t Save(IOutputStream& os) const;
     size_t SaveAndDestroy(IOutputStream& os);
     size_t SaveToFile(const TString& fileName) const {
         TFixedBufferFileOutput out(fileName);
         return Save(out);
     }
- 
+
     void Clear(); // Returns all memory to the system and resets the builder state.
 
     size_t GetEntryCount() const;
@@ -101,8 +101,8 @@ public:
 protected:
     class TCompactTrieBuilderImpl;
     THolder<TCompactTrieBuilderImpl> Impl;
-}; 
- 
+};
+
 //----------------------------------------------------------------------------------------------------------------------
 // Minimize the trie. The result is equivalent to the original
 // trie, except that it takes less space (and has marginally lower
@@ -119,7 +119,7 @@ protected:
 
 template <class TPacker>
 size_t CompactTrieMinimize(IOutputStream& os, const char* data, size_t datalength, bool verbose = false, const TPacker& packer = TPacker(), NCompactTrie::EMinimizeMode mode = NCompactTrie::MM_DEFAULT);
- 
+
 template <class TTrieBuilder>
 size_t CompactTrieMinimize(IOutputStream& os, const TTrieBuilder& builder, bool verbose = false);
 

+ 4 - 4
library/cpp/containers/comptrie/comptrie_builder.inl

@@ -1,5 +1,5 @@
 #pragma once
- 
+
 #include "comptrie_impl.h"
 #include "comptrie_trie.h"
 #include "make_fast_layout.h"
@@ -618,7 +618,7 @@ typename TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode*
     } else {
         ConvertSymbolArrayToChar(key, keylen, ckeybuf, ckeylen);
     }
- 
+
     char* ckey = ckeybuf.Data();
 
     TNode* next;
@@ -907,9 +907,9 @@ ui64 TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::ArcMeasure(
     size_t leftoffsetsize = leftsize ? MeasureOffset(coresize + treesize) : 0;
     size_t rightoffsetsize = rightsize ? MeasureOffset(coresize + treesize + leftsize) : 0;
     leftoffsetsize = leftsize ? MeasureOffset(coresize + treesize + leftoffsetsize + rightoffsetsize) : 0;
-    rightoffsetsize = rightsize ? MeasureOffset(coresize + treesize + leftsize + leftoffsetsize + rightoffsetsize) : 0; 
+    rightoffsetsize = rightsize ? MeasureOffset(coresize + treesize + leftsize + leftoffsetsize + rightoffsetsize) : 0;
     leftoffsetsize = leftsize ? MeasureOffset(coresize + treesize + leftoffsetsize + rightoffsetsize) : 0;
-    rightoffsetsize = rightsize ? MeasureOffset(coresize + treesize + leftsize + leftoffsetsize + rightoffsetsize) : 0; 
+    rightoffsetsize = rightsize ? MeasureOffset(coresize + treesize + leftsize + leftoffsetsize + rightoffsetsize) : 0;
 
     coresize += leftoffsetsize + rightoffsetsize;
     thiz->LeftOffset = leftsize ? coresize + treesize : 0;

+ 6 - 6
library/cpp/containers/comptrie/comptrie_impl.cpp

@@ -8,26 +8,26 @@
 namespace NCompactTrie {
     size_t MeasureOffset(size_t offset) {
         int n = 0;
- 
+
         while (offset) {
             offset >>= 8;
             ++n;
         }
 
         return n;
-    } 
+    }
 
     size_t PackOffset(char* buffer, size_t offset) {
         size_t len = MeasureOffset(offset);
         size_t i = len;
- 
+
         while (i--) {
             buffer[i] = (char)(offset & 0xFF);
             offset >>= 8;
         }
 
         return len;
-    } 
+    }
 
     void ShowProgress(size_t n) {
         if (n % 1000000 == 0)
@@ -35,5 +35,5 @@ namespace NCompactTrie {
         else if (n % 20000 == 0)
             Cerr << ".";
     }
- 
-} 
+
+}

+ 2 - 2
library/cpp/containers/comptrie/comptrie_impl.h

@@ -1,7 +1,7 @@
 #pragma once
- 
+
 #include <util/stream/output.h>
- 
+
 #ifndef COMPTRIE_DATA_CHECK
 #define COMPTRIE_DATA_CHECK 1
 #endif

Некоторые файлы не были показаны из-за большого количества измененных файлов