gentranslit.c 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258
  1. /* Copyright (C) 1999-2003, 2005 Free Software Foundation, Inc.
  2. This file is part of the GNU LIBICONV Library.
  3. The GNU LIBICONV Library is free software; you can redistribute it
  4. and/or modify it under the terms of the GNU Library General Public
  5. License as published by the Free Software Foundation; either version 2
  6. of the License, or (at your option) any later version.
  7. The GNU LIBICONV Library is distributed in the hope that it will be
  8. useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  10. Library General Public License for more details.
  11. You should have received a copy of the GNU Library General Public
  12. License along with the GNU LIBICONV Library; see the file COPYING.LIB.
  13. If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
  14. Fifth Floor, Boston, MA 02110-1301, USA. */
  15. /*
  16. * Generates a table of small strings, used for transliteration, from a table
  17. * containing lines of the form
  18. * Unicode <tab> utf-8 replacement <tab> # comment
  19. */
  20. #include <stdio.h>
  21. #include <stdlib.h>
  22. #include <stdbool.h>
  /*
   * Table generator.
   *
   * Reads a transliteration table from standard input and writes C source
   * to standard output.  Each non-comment input line has the form
   *     <hex code point> TAB <UTF-8 replacement> TAB <anything>
   * and a line whose first character is '#' is skipped.
   *
   * The output consists of
   *   - translit_data[]: for every code point, a length slot followed by
   *     the replacement characters,
   *   - translit_pageXX[] arrays mapping code points to an index into
   *     translit_data (or -1),
   *   - a translit_index(wc) macro selecting the right page array.
   *
   * Exit status is 0 on success and 1 on any usage, input or output error.
   */

  enum {
    UNICODE_LIMIT  = 0x110000,                  /* one past the largest accepted code point */
    DATA_CAPACITY  = 0x100000,                  /* slots available for translit_data */
    LINE_WIDTH     = 8,                         /* code points per table line */
    LINE_COUNT     = UNICODE_LIMIT / LINE_WIDTH,/* 0x22000 lines */
    PAGE_SHIFT     = 5,                         /* 32 lines (256 code points) per page */
    LINES_PER_PAGE = 1 << PAGE_SHIFT,
    MAX_JOIN_GAP   = 8,                         /* lines this close in one page share a table */
    /* Inside one page a new table can only start more than MAX_JOIN_GAP
       lines after the previous table's last line, so at most 4 tables
       start per 32-line page. */
    TABLE_CAPACITY = 4 * (LINE_COUNT / LINES_PER_PAGE),
    SUFFIX_SIZE    = 16                         /* room for "%02x_%d" with any page/serial */
  };

  struct table {
    int minline;                /* first line covered by the table */
    int maxline;                /* last line covered by the table */
    int usecount;               /* number of mapped code points in the range */
    char suffix[SUFFIX_SIZE];   /* name suffix of the page array; "" if none is emitted */
  };

  /* These arrays occupy several MiB in total.  They have static storage
     duration because automatic arrays of this size can exceed the default
     stack limit. */
  static unsigned int data[DATA_CAPACITY];
  static int uni2index[UNICODE_LIMIT];
  static int line[LINE_COUNT];
  static struct table tables[TABLE_CAPACITY];

  /* Reports a fatal error on stderr and terminates with status 1. */
  static void fail (const char *msg)
  {
    fprintf(stderr, "gentranslit: %s\n", msg);
    exit(1);
  }

  /* Consumes input up to and including the next newline (or EOF). */
  static void skip_line (void)
  {
    int c;
    do {
      c = getc(stdin);
    } while (c != EOF && c != '\n');
  }

  /* Stores VALUE in data[COUNT] and returns the new element count. */
  static int append (int count, unsigned int value)
  {
    if (count >= DATA_CAPACITY)
      fail("replacement data does not fit into the table");
    data[count] = value;
    return count + 1;
  }

  /* Finishes reading a multi-byte UTF-8 character whose lead byte LEAD
     (>= 0x80) has already been consumed, and returns the decoded value.
     The value is accumulated as unsigned so that the shifts of the longest
     (6-byte) form cannot overflow a signed int. */
  static unsigned int finish_utf8 (int lead)
  {
    unsigned int len;
    unsigned int value;

    if (lead < 0xc0)
      fail("stray UTF-8 continuation byte in replacement");
    len = (lead < 0xe0 ? 2 : lead < 0xf0 ? 3 : lead < 0xf8 ? 4 : lead < 0xfc ? 5 : 6);
    value = (unsigned int) lead & ((1u << (8 - len)) - 1);
    while (--len > 0) {
      int cc = getc(stdin);
      if (!(cc >= 0x80 && cc < 0xc0))
        fail("truncated or malformed UTF-8 sequence in replacement");
      value = (value << 6) | ((unsigned int) cc & 0x3f);
    }
    return value;
  }

  /* Emits the license header of the generated file. */
  static void print_header (void)
  {
    fputs("/*\n"
          " * Copyright (C) 1999-2003 Free Software Foundation, Inc.\n"
          " * This file is part of the GNU LIBICONV Library.\n"
          " *\n"
          " * The GNU LIBICONV Library is free software; you can redistribute it\n"
          " * and/or modify it under the terms of the GNU Library General Public\n"
          " * License as published by the Free Software Foundation; either version 2\n"
          " * of the License, or (at your option) any later version.\n"
          " *\n"
          " * The GNU LIBICONV Library is distributed in the hope that it will be\n"
          " * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
          " * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n"
          " * Library General Public License for more details.\n"
          " *\n"
          " * You should have received a copy of the GNU Library General Public\n"
          " * License along with the GNU LIBICONV Library; see the file COPYING.LIB.\n"
          " * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,\n"
          " * Fifth Floor, Boston, MA 02110-1301, USA.\n"
          " */\n"
          "\n"
          "/*\n"
          " * Transliteration table\n"
          " */\n"
          "\n",
          stdout);
  }

  /* Parses the whole input, filling data[] and uni2index[].
     Returns the number of elements stored in data[]. */
  static int read_table (void)
  {
    int count = 0;
    int u;

    for (u = 0; u < UNICODE_LIMIT; u++)
      uni2index[u] = -1;

    for (;;) {
      unsigned int code;
      int c = getc(stdin);

      if (c == EOF)
        break;
      if (c == '#') {
        skip_line();
        continue;
      }
      ungetc(c, stdin);

      /* %x stores through an unsigned int pointer; the value indexes
         uni2index[] and therefore must be range-checked. */
      if (scanf("%x", &code) != 1)
        fail("expected a hexadecimal code point");
      if (code >= UNICODE_LIMIT)
        fail("code point out of range");
      if (getc(stdin) != '\t')
        fail("expected a tab after the code point");

      for (;;) {
        unsigned int ch;

        c = getc(stdin);
        if (c == EOF || c == '\n')
          fail("replacement is not terminated by a tab");
        if (c == '\t')
          break;
        if (uni2index[code] < 0) {
          /* First replacement character: reserve the length slot. */
          uni2index[code] = count;
          count = append(count, 0);
        }
        ch = (c >= 0x80 ? finish_utf8(c) : (unsigned int) c);
        count = append(count, ch);
      }
      /* Fill in the length slot; an empty replacement leaves the code
         point unmapped. */
      if (uni2index[code] >= 0)
        data[uni2index[code]] = (unsigned int) (count - uni2index[code] - 1);
      skip_line();
    }
    return count;
  }

  /* Emits the translit_data[] array holding COUNT elements. */
  static void print_data (int count)
  {
    int i;

    printf("static const unsigned int translit_data[%d] = {", count);
    for (i = 0; i < count; i++) {
      unsigned int v = data[i];

      if (v < 32)
        printf("\n %3d,", (int) v);
      else if (v == '\'')
        printf("'\\'',");
      else if (v == '\\')
        printf("'\\\\',");
      else if (v < 127)
        printf(" '%c',", (int) v);
      else if (v < 256)
        printf("0x%02X,", v);
      else
        printf("0x%04X,", v);
    }
    printf("\n};\n");
    printf("\n");
  }

  /* Groups the used lines into tables, filling line[] and tables[].
     Returns the number of tables. */
  static int build_tables (void)
  {
    int tableno = 0;
    int prev_page = -1;
    int serial = 0;
    int j, j1, t;

    /* Mark every line as unused (-1) or used (0, replaced by its table
       number in the next pass). */
    for (j1 = 0; j1 < LINE_COUNT; j1++) {
      line[j1] = -1;
      for (j = LINE_WIDTH * j1; j < LINE_WIDTH * (j1 + 1); j++)
        if (uni2index[j] >= 0) {
          line[j1] = 0;
          break;
        }
    }

    /* A used line joins the most recent table when it directly follows it,
       or when it lies in the same page no more than MAX_JOIN_GAP lines
       after that table's last line; otherwise it opens a new table. */
    for (j1 = 0; j1 < LINE_COUNT; j1++) {
      if (line[j1] < 0)
        continue;
      if (tableno > 0
          && ((j1 > 0 && line[j1 - 1] == tableno - 1)
              || ((tables[tableno - 1].maxline >> PAGE_SHIFT) == (j1 >> PAGE_SHIFT)
                  && j1 - tables[tableno - 1].maxline <= MAX_JOIN_GAP))) {
        tables[tableno - 1].maxline = j1;
      } else {
        if (tableno >= TABLE_CAPACITY)
          fail("too many tables");
        tables[tableno].minline = tables[tableno].maxline = j1;
        tableno++;
      }
      line[j1] = tableno - 1;
    }

    for (t = 0; t < tableno; t++) {
      int first = LINE_WIDTH * tables[t].minline;
      int limit = LINE_WIDTH * (tables[t].maxline + 1);

      tables[t].usecount = 0;
      for (j = first; j < limit; j++)
        if (uni2index[j] >= 0)
          tables[t].usecount++;
    }

    /* Only tables with more than one mapping get a page array.  The first
       such table of a page is named after the page number; later ones in
       the same page get a running serial number appended.  The buffer is
       sized for 4-hex-digit page numbers, which occur for code points
       above 0xFFFF. */
    for (t = 0; t < tableno; t++) {
      tables[t].suffix[0] = '\0';
      if (tables[t].usecount > 1) {
        int page = tables[t].minline >> PAGE_SHIFT;

        if (prev_page == page) {
          snprintf(tables[t].suffix, sizeof tables[t].suffix,
                   "%02x_%d", (unsigned int) page, ++serial);
        } else {
          prev_page = page;
          snprintf(tables[t].suffix, sizeof tables[t].suffix,
                   "%02x", (unsigned int) page);
        }
      }
    }
    return tableno;
  }

  /* Emits one translit_page array per table with more than one mapping. */
  static void print_pages (int tableno)
  {
    bool any_page = false;
    int t, j1, j2;

    for (t = 0; t < tableno; t++) {
      if (tables[t].usecount <= 1)
        continue;
      any_page = true;
      printf("static const short translit_page%s[%d] = {\n", tables[t].suffix,
             LINE_WIDTH * (tables[t].maxline - tables[t].minline + 1));
      for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
        int in_page = LINE_WIDTH * (j1 % LINES_PER_PAGE);

        if ((j1 % LINES_PER_PAGE) == 0 && j1 > tables[t].minline)
          printf(" /* 0x%04x */\n", (unsigned int) (LINE_WIDTH * j1));
        printf(" ");
        for (j2 = 0; j2 < LINE_WIDTH; j2++)
          printf(" %4d,", uni2index[LINE_WIDTH * j1 + j2]);
        printf(" /* 0x%02x-0x%02x */\n",
               (unsigned int) in_page, (unsigned int) (in_page + 7));
      }
      printf("};\n");
    }
    if (any_page)
      printf("\n");
  }

  /* Emits the translit_index(wc) macro: a chain of conditionals, one per
     table, ending in -1. */
  static void print_index_macro (void)
  {
    int j, j1, j2;

    printf("#define translit_index(wc) \\\n (");
    for (j1 = 0; j1 < LINE_COUNT; j1 = j2) {
      int t = line[j1];

      for (j2 = j1; j2 < LINE_COUNT && line[j2] == t; j2++)
        ;
      if (t < 0)
        continue;               /* run of unused lines: nothing to emit */

      /* Consistency checks on the table layout; a table may contain
         unused lines, so jump to its real end. */
      if (j1 != tables[t].minline) abort();
      if (j2 > tables[t].maxline + 1) abort();
      j2 = tables[t].maxline + 1;
      if (tables[t].usecount == 0) abort();

      if (tables[t].usecount == 1) {
        /* A single mapping is tested directly, without a page array. */
        if (j2 != j1 + 1) abort();
        for (j = LINE_WIDTH * j1; j < LINE_WIDTH * j2; j++)
          if (uni2index[j] >= 0) {
            printf("wc == 0x%04x ? %d", (unsigned int) j, uni2index[j]);
            break;
          }
      } else {
        if (j1 == 0)
          printf("wc < 0x%04x", (unsigned int) (LINE_WIDTH * j2));
        else
          printf("wc >= 0x%04x && wc < 0x%04x",
                 (unsigned int) (LINE_WIDTH * j1), (unsigned int) (LINE_WIDTH * j2));
        printf(" ? translit_page%s[wc", tables[t].suffix);
        if (tables[t].minline > 0)
          printf("-0x%04x", (unsigned int) (LINE_WIDTH * j1));
        printf("]");
      }
      printf(" : \\\n ");
    }
    printf("-1)\n");
  }

  /* Entry point: takes no arguments; input on stdin, output on stdout. */
  int main (int argc, char *argv[])
  {
    int count;
    int tableno;

    (void) argv;
    if (argc != 1)
      exit(1);

    print_header();
    count = read_table();
    print_data(count);
    tableno = build_tables();
    print_pages(tableno);
    print_index_macro();

    /* fclose flushes; a failure here means the generated file is incomplete. */
    if (ferror(stdout) || fclose(stdout))
      exit(1);
    exit(0);
  }