- /* Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved.
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License, version 2.0,
- as published by the Free Software Foundation.
- This program is also distributed with certain software (including
- but not limited to OpenSSL) that is licensed under separate terms,
- as designated in a particular file or component or in included license
- documentation. The authors of MySQL hereby grant you an additional
- permission to link the program and your derivative works with the
- separately licensed software that they have included with MySQL.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License, version 2.0, for more details.
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
- /*
- This file is used to dump DUCET 9.0.0 to the tables we use in MySQL
- collations. It is based on the uca-dump.cc file written for UCA 5.2.0, and
- has been changed to dump all three weight levels into one table.
- How to use:
- 1. g++ uca9-dump.cc -o uca9dump
- 2. uca9dump ducet --in_file=/path/to/allkeys.txt --out_file=/path/to/yourfile
- This can also be used to dump weight table of Japanese Han characters.
- How to use:
- 1. Copy the line of Han characters in the CLDR file ja.xml to a separate
- file, e.g. ja_han.txt.
- 2. Make sure the file is saved as UTF-8 (use the 'file' command to check),
- or use iconv to convert it.
- 3. uca9dump ja --in_file=/path/to/ja_han.txt --out_file=/path/to/yourfile
- This can also be used to dump the weight tables of Chinese Han characters.
- How to use:
- 1. Make sure you have uca900_weight and all the weight tables in strings/
- uca900_data.h. If not, please refer to the comments above about how to
- generate those tables.
- 2. Copy the lines of Han characters in the CLDR file zh.xml to a separate
- file, e.g. zh_han.txt.
- 3. Make sure the file is saved as UTF-8 (use the 'file' command to check),
- or use iconv to convert it.
- 4. Remove all the comments ("# XX") at the end of each line, and remove all
- the "<*" at the beginning of each line. "<*" means that every character
- in the line should sort greater than the character before it; uca9dump
- takes care of this ordering. Also remove lines like '\uFDD0A #index A'.
- These lines mark the beginning of each group of characters with similar
- pronunciation; they do not affect how we arrange the weights of the
- characters.
- 5. Join all the lines into one.
- 6. uca9dump zh --in_file=/path/to/zh_han.txt --out_file=/path/to/yourfile
- */
- #include <assert.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include <algorithm>
- #include <map>
- #include <set>
- #include "m_string.h"
- #include "my_compiler.h"
- #include "strings/mb_wc.h"
- #include "strings/uca900_data.h" // uca900_weights[]
- typedef unsigned char uchar;
- typedef unsigned short uint16;
- typedef unsigned int uint;
- typedef unsigned long my_wc_t;
- #define MY_UCA_MAXWEIGHT_TO_PARSE 64 /* Upper bound on weight strings per line */
- #define MY_UCA_MAXCE_TO_PARSE 18     /* Most CEs on any DUCET line (U+FDFA has 18) */
- #define MY_UCA_MAXWEIGHT_TO_DUMP 24  /* MY_UCA_MAXCE_TO_DUMP * MY_UCA_CE_SIZE */
- #define MY_UCA_MAXCE_TO_DUMP 8       /* CEs we keep per character */
- #define MY_UCA_VERSION_SIZE 32
- #define MY_UCA_CE_SIZE 3 /* Weights per CE: primary, secondary, tertiary */
- #define MY_UCA_MAX_CONTRACTION 6
- #define MY_UCA_MAXCHAR (0x10FFFF + 1)
- #define MY_UCA_CHARS_PER_PAGE 256
- #define MY_UCA_PSHIFT 8 /* Shift from a code point to its page number */
- #define MY_UCA_NPAGES (MY_UCA_MAXCHAR / MY_UCA_CHARS_PER_PAGE)
- struct MY_UCA_ITEM {
- int num_of_ce; /* Number of collation elements */
- uint16 weight[MY_UCA_MAXWEIGHT_TO_DUMP + 1];
- /* +1 for trailing num_of_ce */
- };
- struct MY_UCA {
- char version[MY_UCA_VERSION_SIZE];
- MY_UCA_ITEM item[MY_UCA_MAXCHAR]; // Weight info of all characters
- };
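- /*
-   load_uca_file() below parses the DUCET allkeys.txt format: an optional
-   "@version" line, "#" comment lines, and entries of the form
-     <code point(s)> ; [.pppp.ssss.tttt]... # <comment>
-   For example, a typical entry looks roughly like
-     0041 ; [.1C47.0020.0008] # LATIN CAPITAL LETTER A
-   (weights shown are illustrative). Entries with more than one code point
-   before ';' are contractions and are skipped.
- */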
- static int load_uca_file(MY_UCA *uca, int maxchar, int *pageloaded,
- FILE *infile) {
- char str[512];
- int out_of_range_chars = 0;
- for (int lineno = 0; fgets(str, sizeof(str), infile); lineno++) {
- /* Skip empty lines and comment lines */
- if (*str == '\r' || *str == '\n' || *str == '#') continue;
- /* Detect version */
- if (*str == '@') {
- if (!strncmp(str, "@version ", 9)) {
- const char *value;
- if (strtok(str, " \r\n\t") && (value = strtok(nullptr, " \r\n\t")))
- snprintf(uca->version, MY_UCA_VERSION_SIZE, "%s", value);
- }
- continue;
- }
- int code;
- /* Skip characters beyond maxchar */
- if ((code = strtol(str, nullptr, 16)) > maxchar) {
- out_of_range_chars++;
- continue;
- }
- char *comment;
- if (!(comment = strchr(str, '#'))) {
- fprintf(stderr, "Warning: could not parse line #%d:\n'%s'\n", lineno,
- str);
- continue;
- }
- *comment = '\0';
- char *weight;
- if ((weight = strchr(str, ';'))) {
- *weight++ = '\0';
- weight += strspn(weight, " ");
- } else {
- fprintf(stderr, "Warning: could not parse line #%d:\n%s\n", lineno, str);
- continue;
- }
- char *s;
- int codenum;
- for (codenum = 0, s = strtok(str, " \t"); s;
- codenum++, s = strtok(nullptr, " \t")) {
- /* This is a contraction; contractions are not handled yet. */
- if (codenum >= 1) {
- codenum++;
- break;
- }
- }
- MY_UCA_ITEM *item = nullptr;
- if (codenum > 1) {
- /* Contractions we don't support. */
- continue;
- } else {
- item = &uca->item[code];
- }
- /*
- Split weight string into separate weights
- "[p1.s1.t1.q1][p2.s2.t2.q2][p3.s3.t3.q3]" ->
- "p1.s1.t1.q1" "p2.s2.t2.q2" "p3.s3.t3.q3"
- */
- item->num_of_ce = 0;
- s = strtok(weight, " []");
- char *weights[MY_UCA_MAXWEIGHT_TO_PARSE];
- while (s) {
- if (item->num_of_ce >= MY_UCA_MAXCE_TO_PARSE) {
- fprintf(stderr, "Line #%d has more than %d collation elements\n",
- lineno, MY_UCA_MAXCE_TO_PARSE);
- fprintf(stderr, "Can't continue.\n");
- exit(1);
- }
- weights[item->num_of_ce] = s;
- s = strtok(nullptr, " []");
- item->num_of_ce++;
- }
- for (int i = 0; i < item->num_of_ce; i++) {
- /*
- The longest collation element in DUCET is assigned to 0xFDFA. It
- has 18 collation elements. The second longest has 8. Because eight
- collation elements are enough to distinguish 0xFDFA from other
- characters, we skip the extra weights and only use 8 here.
- */
- if (i >= MY_UCA_MAXCE_TO_DUMP) {
- fprintf(stderr,
- "Warning: at line %d: character %04X has"
- " more than %d collation elements (%d). "
- "Skipping the extra weights.\n",
- lineno, code, MY_UCA_MAXCE_TO_DUMP, item->num_of_ce);
- item->num_of_ce = MY_UCA_MAXCE_TO_DUMP;
- break;
- }
- int weight_of_ce = 0;
- for (s = weights[i]; *s;) {
- char *endptr;
- int part = strtol(s + 1, &endptr, 16);
- if (i < MY_UCA_MAXCE_TO_DUMP) {
- item->weight[i * MY_UCA_CE_SIZE + weight_of_ce] = part;
- } else {
- fprintf(stderr, "Too many weights (%d) at line %d\n", i, lineno);
- exit(1);
- }
- s = endptr;
- weight_of_ce++;
- }
- }
- /* Mark that a character from this page was loaded */
- pageloaded[code >> MY_UCA_PSHIFT]++;
- }
- if (out_of_range_chars)
- fprintf(stderr, "%d out-of-range characters skipped\n", out_of_range_chars);
- return 0;
- }
- #define HANGUL_JAMO_MAX_LENGTH 3
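- /*
-   Decompose a precomposed Hangul syllable into its leading consonant,
-   vowel and optional trailing consonant jamo, using the standard arithmetic
-   decomposition from the Unicode Standard (chapter 3.12).
-   For example, U+AC01 has syllable index 1, which gives leading jamo U+1100,
-   vowel jamo U+1161 and trailing jamo U+11A8.
- */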
- static int my_decompose_hangul_syllable(my_wc_t syllable, my_wc_t *jamo) {
- if (syllable < 0xAC00 || syllable > 0xD7AF) return 0;
- constexpr int syllable_base = 0xAC00;
- constexpr int leadingjamo_base = 0x1100;
- constexpr int voweljamo_base = 0x1161;
- constexpr int trailingjamo_base = 0x11A7;
- constexpr int voweljamo_cnt = 21;
- constexpr int trailingjamo_cnt = 28;
- int syllable_index = syllable - syllable_base;
- int v_t_combination = voweljamo_cnt * trailingjamo_cnt;
- int leadingjamo_index = syllable_index / v_t_combination;
- int voweljamo_index = (syllable_index % v_t_combination) / trailingjamo_cnt;
- int trailingjamo_index = syllable_index % trailingjamo_cnt;
- jamo[0] = leadingjamo_base + leadingjamo_index;
- jamo[1] = voweljamo_base + voweljamo_index;
- jamo[2] = trailingjamo_index ? (trailingjamo_base + trailingjamo_index) : 0;
- return trailingjamo_index ? 3 : 2;
- }
- void my_put_jamo_weights(const my_wc_t *hangul_jamo, int jamo_cnt,
- MY_UCA_ITEM *item, const MY_UCA *uca) {
- for (int jamoind = 0; jamoind < jamo_cnt; jamoind++) {
- uint16 *implicit_weight = item->weight + jamoind * MY_UCA_CE_SIZE;
- const uint16 *jamo_weight = uca->item[hangul_jamo[jamoind]].weight;
- *implicit_weight = *jamo_weight;
- *(implicit_weight + 1) = *(jamo_weight + 1);
- *(implicit_weight + 2) = *(jamo_weight + 2) + 1;
- }
- item->num_of_ce = jamo_cnt;
- }
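- /*
-   Give a character the implicit weight defined by UCA (UTS #10): two
-   collation elements [.AAAA.0020.0002][.BBBB.0000.0000], where
-     AAAA = base + (code >> 15) and BBBB = (code & 0x7FFF) | 0x8000,
-   with the base depending on the block the character belongs to.
-   For example, the core Han character U+4E2D gets AAAA = 0xFB40 and
-   BBBB = 0xCE2D.
- */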
- static void set_implicit_weights(MY_UCA_ITEM *item, int code) {
- int base, aaaa, bbbb;
- if (code >= 0x17000 && code <= 0x18AFF) // Tangut character
- {
- aaaa = 0xFB00;
- bbbb = (code - 0x17000) | 0x8000;
- } else {
- /* non-Core Han Unified Ideographs */
- if ((code >= 0x3400 && code <= 0x4DB5) ||
- (code >= 0x20000 && code <= 0x2A6D6) ||
- (code >= 0x2A700 && code <= 0x2B734) ||
- (code >= 0x2B740 && code <= 0x2B81D) ||
- (code >= 0x2B820 && code <= 0x2CEA1))
- base = 0xFB80;
- /* Core Han Unified Ideographs */
- else if ((code >= 0x4E00 && code <= 0x9FD5) ||
- (code >= 0xFA0E && code <= 0xFA29))
- base = 0xFB40;
- /* All other characters whose weight is unassigned */
- else
- base = 0xFBC0;
- aaaa = base + (code >> 15);
- bbbb = (code & 0x7FFF) | 0x8000;
- }
- item->weight[0] = aaaa;
- item->weight[1] = 0x0020;
- item->weight[2] = 0x0002;
- item->weight[3] = bbbb;
- item->weight[4] = 0x0000;
- item->weight[5] = 0x0000;
- item->num_of_ce = 2;
- }
- /*
- We need to initialize implicit weights because
- some pages have both implicit and explicit weights:
- 0x4D??, 0x9F??
- */
- static void set_implicit_weights(MY_UCA *uca, const int *pageloaded) {
- for (int page = 0; page < MY_UCA_NPAGES; page++) {
- if (pageloaded[page] == MY_UCA_CHARS_PER_PAGE) continue;
- /* Now set implicit weights */
- for (int code = page * MY_UCA_CHARS_PER_PAGE;
- code < (page + 1) * MY_UCA_CHARS_PER_PAGE; code++) {
- MY_UCA_ITEM *item = &uca->item[code];
- if (item->num_of_ce) continue;
- int jamo_cnt = 0;
- my_wc_t hangul_jamo[HANGUL_JAMO_MAX_LENGTH];
- if ((jamo_cnt = my_decompose_hangul_syllable(code, hangul_jamo))) {
- my_put_jamo_weights(hangul_jamo, jamo_cnt, item, uca);
- continue;
- }
- set_implicit_weights(item, code);
- }
- }
- }
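- /*
-   Find the largest number of collation elements used by any character on
-   this page; the caller uses it to size the dumped page array.
- */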
- static void get_page_statistics(const MY_UCA *uca, int page, int *maxnum) {
- for (int offs = 0; offs < MY_UCA_CHARS_PER_PAGE; offs++) {
- const MY_UCA_ITEM *item = &uca->item[page * MY_UCA_CHARS_PER_PAGE + offs];
- *maxnum = *maxnum < item->num_of_ce ? item->num_of_ce : *maxnum;
- }
- }
- /*
- Compose the prefix name of weight tables from the version number,
- e.g. version "9.0.0" gives the prefix "uca900".
- */
- static char *prefix_name(const MY_UCA *uca) {
- static char prefix[MY_UCA_VERSION_SIZE];
- const char *s;
- char *d;
- strcpy(prefix, "uca");
- for (s = uca->version, d = prefix + strlen(prefix); *s; s++) {
- if ((*s >= '0' && *s <= '9') || (*s >= 'a' && *s <= 'z')) *d++ = *s;
- }
- *d = '\0';
- return prefix;
- }
- static char *page_name(const MY_UCA *uca, int page, bool pageloaded) {
- static char page_name_buf[120];
- static char page_name_null[] = "NULL";
- if (pageloaded) {
- snprintf(page_name_buf, sizeof(page_name_buf), "%s_p%03X", prefix_name(uca),
- page);
- return page_name_buf;
- } else
- return page_name_null;
- }
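- /*
-   Print one 256-character page as a uint16 array: the first 256 entries hold
-   the number of CEs for each character, followed by (maxnum - 1) rows of 256
-   weights, interleaving the primary, secondary and tertiary weights of each
-   CE.
- */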
- static void print_one_page(const MY_UCA *uca, int page,
- const char *page_name_prefix, int maxnum,
- FILE *outfile) {
- if (page_name_prefix == nullptr)
- fprintf(outfile, "uint16 %s[] = {\n", page_name(uca, page, true));
- else
- fprintf(outfile, "uint16 %s%03X[] = {\n", page_name_prefix, page);
- fprintf(outfile, " /* Number of CEs for each character. */\n");
- for (int offs = 0; offs < MY_UCA_CHARS_PER_PAGE; ++offs) {
- const int code = page * MY_UCA_CHARS_PER_PAGE + offs;
- const MY_UCA_ITEM *item = &uca->item[code];
- if ((offs % 16) == 0) fprintf(outfile, " ");
- fprintf(outfile, "%d, ", item->num_of_ce);
- if ((offs % 16) == 15) fprintf(outfile, "\n");
- }
- for (int i = 0; i < maxnum - 1; i++) {
- fprintf(outfile, "\n");
- if ((i % 3) == 0) {
- fprintf(outfile, " /* Primary weight %d for each character. */\n",
- i / 3 + 1);
- } else if ((i % 3) == 1) {
- fprintf(outfile, " /* Secondary weight %d for each character. */\n",
- i / 3 + 1);
- } else {
- fprintf(outfile, " /* Tertiary weight %d for each character. */\n",
- i / 3 + 1);
- }
- for (int offs = 0; offs < MY_UCA_CHARS_PER_PAGE; offs++) {
- const int code = page * MY_UCA_CHARS_PER_PAGE + offs;
- const MY_UCA_ITEM *item = &uca->item[code];
- const uint16 *weight = item->weight;
- fprintf(outfile, " 0x%04X, /* U+%04X */\n", weight[i], code);
- }
- }
- fprintf(outfile, "};\n\n");
- }
- /*
- This function reads in the language-specific data (the Han character list).
- */
- int read_in_lang_data(char *inbytes, int maxbytes, FILE *infile) {
- do {
- if (!fgets((char *)inbytes, maxbytes, infile)) {
- fprintf(stderr, "Could not read more characters.\n");
- return -1;
- }
- } while (!strncmp((char *)inbytes, "#", 1)); // Jump over copyright info.
- return 0;
- }
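- /*
-   Dump the weight table for Japanese Han characters: each Han character
-   listed in ja_han.txt gets a single collation element whose primary weight
-   is JA_CORE_HAN_BASE_WT plus its position in the file; the other characters
-   in the touched pages are assigned their implicit weights.
- */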
- int dump_ja_hans(MY_UCA *uca, FILE *infile, FILE *outfile) {
- // There are 6355 Japanese Han characters.
- unsigned char ja_u8_bytes[8000 * 3] = {0};
- if (read_in_lang_data((char *)ja_u8_bytes, sizeof(ja_u8_bytes), infile))
- return 1;
- int ja_length = strlen((char *)ja_u8_bytes);
- while (ja_length > 0 && (ja_u8_bytes[ja_length - 1] == '\n' ||
- ja_u8_bytes[ja_length - 1] == '\r')) {
- ja_u8_bytes[ja_length - 1] = '\0';
- ja_length--;
- }
- // All these Japanese Han characters should be 3 bytes.
- if ((ja_length % 3)) {
- fprintf(stderr, "Wrong UTF8 Han character bytes.\n");
- return 1;
- }
- int han_cnt = ja_length / 3;
- const int JA_CORE_HAN_BASE_WT = 0x54A4;
- const int ja_han_page_cnt = 0x9F - 0x4E + 1;
- // Set weight for Japanese Han characters.
- unsigned char *ja_han = ja_u8_bytes;
- int min_page = 0x1100; // Larger than any valid page (max code point is 0x10FFFF).
- int max_page = 0;
- for (int i = 0; i < han_cnt; i++) {
- my_wc_t ja_ch_u16 = 0;
- int bytes = my_mb_wc_utf8mb4(&ja_ch_u16, ja_han, ja_han + ja_length);
- if (bytes <= 0) break;
- ja_han += bytes;
- int page MY_ATTRIBUTE((unused)) = ja_ch_u16 >> 8;
- assert(page >= 0x4E && page <= 0x9F);
- MY_UCA_ITEM *item = &uca->item[ja_ch_u16 - 0x4E00];
- item->num_of_ce = 1;
- item->weight[0] = JA_CORE_HAN_BASE_WT + i;
- item->weight[1] = 0x20;
- item->weight[2] = 0x02;
- min_page = std::min(min_page, page);
- max_page = std::max(max_page, page);
- }
- // Set implicit weight for non-Japanese characters.
- for (int page = min_page; page <= max_page; page++) {
- for (int offs = 0; offs < MY_UCA_CHARS_PER_PAGE; ++offs) {
- int code = (page << 8) + offs;
- int ind = code - 0x4E00;
- MY_UCA_ITEM *item = &uca->item[ind];
- if (item->num_of_ce == 0) set_implicit_weights(item, code);
- }
- }
- fprintf(outfile, "#include \"my_inttypes.h\"\n\n");
- fprintf(outfile, "extern const int MIN_JA_HAN_PAGE = 0x%X;\n", min_page);
- fprintf(outfile, "extern const int MAX_JA_HAN_PAGE = 0x%X;\n\n", max_page);
- // Print weights.
- for (int page = 0; page < ja_han_page_cnt; page++) {
- fprintf(outfile, "uint16 ja_han_page%2X[]= {\n", min_page + page);
- fprintf(outfile, " /* Number of CEs for each character. */\n");
- for (int offs = 0; offs < MY_UCA_CHARS_PER_PAGE; ++offs) {
- int ind = (page << 8) + offs;
- MY_UCA_ITEM *item = &uca->item[ind];
- if ((offs % 16) == 0) fprintf(outfile, " ");
- fprintf(outfile, "%d, ", item->num_of_ce);
- if ((offs % 16) == 15) fprintf(outfile, "\n");
- }
- for (int i = 0; i < 6; i++) {
- fprintf(outfile, "\n");
- if ((i % 3) == 0) {
- fprintf(outfile, " /* Primary weight %d for each character. */\n",
- i / 3 + 1);
- } else if ((i % 3) == 1) {
- fprintf(outfile, " /* Secondary weight %d for each character. */\n",
- i / 3 + 1);
- } else {
- fprintf(outfile, " /* Tertiary weight %d for each character. */\n",
- i / 3 + 1);
- }
- for (int offs = 0; offs < MY_UCA_CHARS_PER_PAGE; offs++) {
- const int ind = page * MY_UCA_CHARS_PER_PAGE + offs;
- const int code = (page + min_page) * MY_UCA_CHARS_PER_PAGE + offs;
- const MY_UCA_ITEM *item = &uca->item[ind];
- const uint16 *weight = item->weight;
- fprintf(outfile, " 0x%04X, /* U+%04X */\n", weight[i], code);
- }
- }
- fprintf(outfile, "};\n\n");
- }
- /* Print page index */
- fprintf(outfile, "uint16* ja_han_pages[%d]= {\n", ja_han_page_cnt);
- for (int page = 0; page < ja_han_page_cnt; page++) {
- if (!(page % 5))
- fprintf(outfile, "%13s%2X", "ja_han_page", page + min_page);
- else
- fprintf(outfile, "%12s%2X", "ja_han_page", page + min_page);
- if ((page + 1) != ja_han_page_cnt) fprintf(outfile, ",");
- if (!((page + 1) % 5) || (page + 1) == ja_han_page_cnt)
- fprintf(outfile, "\n");
- }
- fprintf(outfile, "};\n\n");
- return 0;
- }
- /*
- Chinese Han characters are assigned an implicit weight according to the
- Unicode Collation Algorithm. But when creating our Chinese collation for
- utf8mb4, to implement this language's reorder rule, we give the Han
- characters in the CLDR zh.xml file weight values from 0x1C47 to 0xBDBE, and
- let the other Han characters keep their implicit weights. Per UCA, the
- smallest leading primary weight of an implicit weight is 0xFB00, and the
- largest primary weight we occupy for the Han characters in zh.xml is 0xBDBE.
- There is a huge gap between these two weight values. To use this gap, and to
- let character groups like Latin and Cyrillic keep a single primary weight as
- they did before reordering, we change the leading primary weight of the
- implicit weight as below.
- */
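- /*
- For example, a core Han character not listed in zh.xml whose implicit
- leading primary weight is 0xFB41 is remapped to 0xBDC0 by the switch below;
- leads that are not listed in the switch are shifted by 0xF622 - 0xFBC0.
- */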
- uint16 change_zh_implicit(uint16 weight) {
- switch (weight) {
- case 0xFB00:
- return 0xF621;
- case 0xFB40:
- return 0xBDBF;
- case 0xFB41:
- return 0xBDC0;
- case 0xFB80:
- return 0xBDC1;
- case 0xFB84:
- return 0xBDC2;
- case 0xFB85:
- return 0xBDC3;
- default:
- return weight + 0xF622 - 0xFBC0;
- }
- }
- /*
- UCA defines an algorithm to calculate a character's implicit weight if the
- character's weight is not defined in the DUCET. This function helps convert
- a Chinese character's implicit weight, as calculated by UCA, back to its
- code point.
- The mapping between implicit weights and code points is not one-to-one in
- general, but each Han character in the zh.xml file has an implicit weight
- that is unique among them.
- */
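- /*
- For example, the core Han character U+4E2D gets the implicit weight
- [.FB40.0020.0002][.CE2D.0000.0000]; this function maps the primary pair
- (0xFB40, 0xCE2D) back to ((0xFB40 - 0xFB40) << 15) | (0xCE2D & 0x7FFF) =
- 0x4E2D.
- */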
- my_wc_t convert_implicit_to_ch(uint16 first, uint16 second) {
- assert(first >= 0xFB40 && first <= 0xFBC1);
- if (first < 0xFB80)
- return (((first - 0xFB40) << 15) | (second & 0x7FFF));
- else if (first < 0xFBC0)
- return (((first - 0xFB80) << 15) | (second & 0x7FFF));
- else
- return (((first - 0xFBC0) << 15) | (second & 0x7FFF));
- }
- int dump_zh_hans(MY_UCA *uca, int *pageloaded, FILE *infile, FILE *outfile) {
- /*
- zh.xml of cldr v33 defines 41336 Chinese Han characters. This xml file is
- encoded in utf8. Most of the Han characters are encoded in 3 bytes, and some
- are encoded in 4 bytes.
- */
- constexpr int ZH_HAN_CNT = 41336;
- unsigned char zh_bytes[ZH_HAN_CNT * 4]{0};
- if (read_in_lang_data((char *)zh_bytes, sizeof(zh_bytes), infile)) return 1;
- /*
- Because of the rule [reorder Hani], Chinese Han characters' weights should
- be smaller than those of any other non-ignorable characters (except the
- core characters like spaces and symbols).
- To implement the reordering, we change the weights of all characters as
- follows:
- Char Group   | Origin Weight Range         | Reordered Weight Range
- -------------|-----------------------------|----------------------------
- core chars   | 0200 - 1C46                 | 0200 - 1C46
- Han in zh.xml| [FB40, AAAA] - [FB85, BBBB] | 1C47 - BDBE
- Other Han    | [FB40, CCCC] - [FB85, DDDD] | [BDBF, CCCC] - [BDC3, DDDD]
- Latin, etc   | 1C47 - 54A3                 | BDC4 - F620
- Others       | [FBC0, XXXX] - [FBE1, YYYY] | [F621, XXXX] - [F642, YYYY]
- This function changes only the weights of the Han characters defined in
- zh.xml and of the other characters that reside in the same pages as those
- Han characters.
- */
- constexpr int ZH_CORE_HAN_BASE_WT = 0x1C47;
- std::map<int, int> zh_han_to_single_weight_map;
- unsigned char *zh_ch = zh_bytes;
- int zh_len = strlen((char *)zh_bytes);
- int min_page = 0x1100; // Larger than any valid page (max code point is 0x10FFFF).
- int max_page = 0;
- for (int i = 0; i < ZH_HAN_CNT; i++) {
- my_wc_t ch = 0;
- int bytes = my_mb_wc_utf8mb4(&ch, zh_ch, zh_ch + zh_len);
- if (bytes <= 0) break;
- zh_ch += bytes;
- int page = ch >> 8;
- uca->item[ch].num_of_ce = 1;
- uca->item[ch].weight[0] = ZH_CORE_HAN_BASE_WT + i;
- uca->item[ch].weight[1] = 0x20;
- uca->item[ch].weight[2] = 0x02;
- pageloaded[page]++;
- min_page = std::min(min_page, page);
- max_page = std::max(max_page, page);
- MY_UCA_ITEM tmp_item;
- set_implicit_weights(&tmp_item, ch);
- zh_han_to_single_weight_map[ch] = ZH_CORE_HAN_BASE_WT + i;
- }
- // Chinese Han characters defined in zh.xml are all in pages 0x2E ~ 0x9F and
- // pages 0x200 ~ 0x2B8.
- for (int page = min_page; page <= max_page; page++) {
- if (pageloaded[page]) {
- // This page also exists in DUCET.
- if (uca900_weight[page]) {
- for (int off = 0; off < MY_UCA_CHARS_PER_PAGE; off++) {
- int ch = (page << 8) + off;
- // Copy other characters' weight from DUCET.
- if (uca->item[ch].num_of_ce == 0) {
- uca->item[ch].num_of_ce =
- UCA900_NUM_OF_CE(uca900_weight[page], off);
- for (int level = 0; level < 3; level++) {
- uint16 *weight =
- UCA900_WEIGHT_ADDR(uca900_weight[page], level, off);
- uint16 *dst = uca->item[ch].weight + level;
- for (int ce = 0; ce < uca->item[ch].num_of_ce; ce++) {
- if (*weight >= 0x1C47 && *weight <= 0x54A3) {
- *dst = *weight + 0xBDC4 - 0x1C47;
- } else if (*weight >= 0xFB00) { // implicit weight
- uint16 next_implicit =
- *(weight + UCA900_DISTANCE_BETWEEN_WEIGHTS);
- my_wc_t ch = convert_implicit_to_ch(*weight, next_implicit);
- if (zh_han_to_single_weight_map.find(ch) !=
- zh_han_to_single_weight_map.end()) {
- *dst = zh_han_to_single_weight_map[ch];
- dst += 3;
- weight += UCA900_DISTANCE_BETWEEN_WEIGHTS;
- ce++;
- } else {
- *dst = change_zh_implicit(*weight);
- dst += 3;
- weight += UCA900_DISTANCE_BETWEEN_WEIGHTS;
- ce++;
- *dst = *weight;
- dst += 3;
- weight += UCA900_DISTANCE_BETWEEN_WEIGHTS;
- }
- } else {
- *dst = *weight;
- }
- dst += 3;
- weight += UCA900_DISTANCE_BETWEEN_WEIGHTS;
- }
- }
- }
- }
- } else {
- for (int off = 0; off < MY_UCA_CHARS_PER_PAGE; off++) {
- int ch = (page << 8) + off;
- if (uca->item[ch].num_of_ce == 0) {
- // calculate its implicit weight.
- set_implicit_weights(&uca->item[ch], ch);
- // Only the first primary weight needs to be changed in place.
- uca->item[ch].weight[0] =
- change_zh_implicit(uca->item[ch].weight[0]);
- }
- }
- }
- }
- }
- fprintf(outfile, "#include \"my_inttypes.h\"\n\n");
- fprintf(outfile, "extern const int MIN_ZH_HAN_PAGE = 0x%X;\n", min_page);
- fprintf(outfile, "extern const int MAX_ZH_HAN_PAGE = 0x%X;\n\n", max_page);
- for (int page = min_page; page <= max_page; page++) {
- if (pageloaded[page]) {
- int maxnum = 0;
- get_page_statistics(uca, page, &maxnum);
- maxnum = maxnum * MY_UCA_CE_SIZE + 1;
- print_one_page(uca, page, "zh_han_p", maxnum, outfile);
- }
- }
- fprintf(outfile, "uint16* zh_han_pages[%d] = {\n", max_page - min_page + 1);
- for (int page = min_page; page <= max_page; page++) {
- if (!((page - min_page) % 5)) {
- if (pageloaded[page]) {
- fprintf(outfile, "%10s%03X", "zh_han_p", page);
- } else {
- fprintf(outfile, "%13s", "NULL");
- }
- } else {
- if (pageloaded[page]) {
- fprintf(outfile, "%9s%03X", "zh_han_p", page);
- } else {
- fprintf(outfile, "%12s", "NULL");
- }
- }
- // The zh_han_pages array has max_page - min_page + 1 entries (not
- // MY_UCA_NPAGES), so the last entry is the one for max_page.
- if (page != max_page) fprintf(outfile, ",");
- if (!((page - min_page + 1) % 5) || page == max_page)
- fprintf(outfile, "\n");
- }
- fprintf(outfile, "\n};\n\n");
- fprintf(outfile, "int zh_han_to_single_weight[] = {\n");
- for (auto map_it = zh_han_to_single_weight_map.begin();
- map_it != zh_han_to_single_weight_map.end(); map_it++) {
- fprintf(outfile, " 0x%05X, 0x%04X,\n", map_it->first, map_it->second);
- }
- fprintf(outfile, "\n};\n\n");
- fprintf(outfile, "extern const int ZH_HAN_WEIGHT_PAIRS = %lu;\n",
- static_cast<unsigned long>(zh_han_to_single_weight_map.size()));
- return 0;
- }
- enum OPT_DUMP { DUCET_DUMP, JA_DUMP, ZH_DUMP, DUMP_ERROR };
- OPT_DUMP handle_options(int ac, char **av, char **infilename,
- char **outfilename) {
- if (ac != 4) return DUMP_ERROR;
- if (!native_strcasecmp(av[1], "ducet") || !native_strcasecmp(av[1], "ja") ||
- !native_strcasecmp(av[1], "zh")) {
- if (!native_strncasecmp(av[2], "--in_file=", 10)) *infilename = av[2] + 10;
- if (!native_strncasecmp(av[3], "--out_file=", 11))
- *outfilename = av[3] + 11;
- if (*infilename == nullptr || *outfilename == nullptr) return DUMP_ERROR;
- if (!native_strcasecmp(av[1], "ducet")) return DUCET_DUMP;
- if (!native_strcasecmp(av[1], "ja")) return JA_DUMP;
- if (!native_strcasecmp(av[1], "zh")) return ZH_DUMP;
- }
- return DUMP_ERROR;
- }
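- /*
-   Dump the full DUCET: load allkeys.txt, fill in implicit weights for the
-   characters the file does not cover, and print one weight array per
-   256-character page plus the page index, in the same layout as the tables
-   in strings/uca900_data.h.
- */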
- int dump_ducet(MY_UCA *uca, int *pageloaded, FILE *infile, FILE *outfile) {
- int maxchar = MY_UCA_MAXCHAR;
- load_uca_file(uca, maxchar, pageloaded, infile);
- set_implicit_weights(uca, pageloaded);
- int pagemaxlen[MY_UCA_NPAGES];
- for (int page = 0; page < MY_UCA_NPAGES; page++) {
- int maxnum = 0;
- pagemaxlen[page] = 0;
- /* Skip this page if no weights were loaded */
- if (!pageloaded[page]) continue;
- /*
- Calculate number of weights per character
- and number of default weights.
- */
- get_page_statistics(uca, page, &maxnum);
- maxnum = maxnum * MY_UCA_CE_SIZE + 1;
- pagemaxlen[page] = maxnum;
- print_one_page(uca, page, nullptr, maxnum, outfile);
- }
- /* Print page index */
- fprintf(outfile, "uint16* %s_weight[%d]= {\n", prefix_name(uca),
- MY_UCA_NPAGES);
- for (int page = 0; page < MY_UCA_NPAGES; page++) {
- if (!(page % 6))
- fprintf(outfile, "%13s", page_name(uca, page, pagemaxlen[page]));
- else
- fprintf(outfile, "%12s", page_name(uca, page, pagemaxlen[page]));
- if ((page + 1) != MY_UCA_NPAGES) fprintf(outfile, ",");
- if (!((page + 1) % 6) || (page + 1) == MY_UCA_NPAGES)
- fprintf(outfile, "\n");
- }
- fprintf(outfile, "};\n\n");
- return 0;
- }
- int main(int ac, char **av) {
- char *infilename = nullptr;
- char *outfilename = nullptr;
- OPT_DUMP od = handle_options(ac, av, &infilename, &outfilename);
- if (od == DUMP_ERROR) {
- printf(
- "Usage: uca9dump [ducet|ja|zh] --in_file=[inputfile] "
- "--out_file=[outputfile]\n");
- return 0;
- }
- FILE *infile = fopen(infilename, "rb");
- if (!infile) {
- printf("Can not open the file: %s\n", infilename);
- return 0;
- }
- FILE *outfile = fopen(outfilename, "wb");
- if (!outfile) {
- printf("Can not open the file: %s\n", outfilename);
- fclose(infile);
- return 0;
- }
- MY_UCA *uca = new MY_UCA();
- int pageloaded[MY_UCA_NPAGES];
- memset(uca, 0, sizeof(MY_UCA));
- memset(pageloaded, 0, sizeof(pageloaded));
- switch (od) {
- case DUCET_DUMP:
- dump_ducet(uca, pageloaded, infile, outfile);
- break;
- case JA_DUMP:
- dump_ja_hans(uca, infile, outfile);
- break;
- case ZH_DUMP:
- dump_zh_hans(uca, pageloaded, infile, outfile);
- break;
- default:
- printf(
- "Usage: uca9dump [ducet|ja|zh] --in_file=[inputfile] "
- "--out_file=[outputfile]\n");
- break;
- }
- fclose(infile);
- fclose(outfile);
- delete uca;
- return 0;
- }