genpages.c 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207
  1. /**
  2. * @file genpages.c
  3. * @brief generate required font page files
  4. * @author Yunhui Fu (yhfudev@gmail.com)
  5. * @version 1.0
  6. * @date 2015-02-19
  7. * @copyright Yunhui Fu (2015)
  8. */
  9. #include <stdio.h>
  10. #include <stdint.h> /* uint8_t */
  11. #include <stdlib.h> /* size_t */
  12. #include <string.h>
  13. #include <assert.h>
  14. #include "getline.h"
  /**
   * @brief Decode the UTF-8 sequence that starts at pstart into a single value.
   *
   * @param pstart : pointer to the lead byte of the sequence
   *
   * @return the decoded value, or 0 when the first byte is not a usable lead
   *         byte (a continuation byte, 0xFE or 0xFF)
   *
   * The historical 1- to 6-byte UTF-8 forms are accepted. Trail bytes are not
   * validated: the low six bits of each following byte are merged as they are.
   */
  wchar_t get_val_utf82uni(uint8_t *pstart) {
      /* Lead-byte layouts of the 2..6 byte forms: (lead & mask) == tag.
       * Entry i describes a sequence with i + 1 trail bytes. */
      static const struct {
          uint8_t mask;
          uint8_t tag;
      } forms[] = {
          { 0xE0, 0xC0 },
          { 0xF0, 0xE0 },
          { 0xF8, 0xF0 },
          { 0xFC, 0xF8 },
          { 0xFE, 0xFC }
      };
      const uint8_t lead = pstart[0];
      size_t trailing = 0;
      wchar_t code = 0;
      size_t i;

      /* Plain ASCII is its own value. */
      if ((lead & 0x80) == 0) {
          return lead;
      }

      for (i = 0; i < sizeof forms / sizeof forms[0]; i++) {
          if ((lead & forms[i].mask) == forms[i].tag) {
              trailing = i + 1;
              /* Keep only the payload bits below the length marker. */
              code = lead & ~forms[i].mask;
              break;
          }
      }

      /* No form matched: trailing stays 0 and the result is 0. */
      for (i = 1; i <= trailing; i++) {
          code = (wchar_t)((code << 6) | (pstart[i] & 0x3F));
      }
      return code;
  }
  /**
   * @brief Convert one UTF-8 encoded character to a Unicode value (wchar_t).
   *
   * @param pstart : pointer to the first byte of the UTF-8 character; must not
   *                 be NULL
   * @param pval : address that receives the decoded value; may be NULL when the
   *               caller only needs the position of the next character
   *
   * @return pointer to the first byte that was not consumed
   *
   * The historical 1- to 6-byte UTF-8 forms are accepted. Every trail byte must
   * have the form 10xxxxxx; decoding stops at the first byte that does not, the
   * offending byte is left unconsumed and the reported value is 0. Because a
   * NUL terminator is never a valid trail byte, a truncated sequence at the end
   * of a string can no longer make the decoder read past the terminator.
   *
   * A run of stray continuation bytes, or of 0xFE/0xFF bytes, is skipped as a
   * whole and reported as value 0. At least one byte is always consumed.
   */
  uint8_t* get_utf8_value(uint8_t *pstart, wchar_t *pval) {
      uint32_t val = 0;
      uint8_t *p = pstart;
      size_t ntrail = 0; /* number of trail bytes announced by the lead byte */

      assert(NULL != pstart);

      if (0 == (0x80 & *p)) {
          /* single byte (ASCII) */
          val = *p;
          p++;
      }
      else if (0xC0 == (0xE0 & *p)) {
          val = *p & 0x1F;
          ntrail = 1;
      }
      else if (0xE0 == (0xF0 & *p)) {
          val = *p & 0x0F;
          ntrail = 2;
      }
      else if (0xF0 == (0xF8 & *p)) {
          val = *p & 0x07;
          ntrail = 3;
      }
      else if (0xF8 == (0xFC & *p)) {
          val = *p & 0x03;
          ntrail = 4;
      }
      else if (0xFC == (0xFE & *p)) {
          val = *p & 0x01;
          ntrail = 5;
      }
      else if (0x80 == (0xC0 & *p)) {
          /* error: continuation byte without a lead byte; skip the whole run */
          for (; 0x80 == (0xC0 & *p); p++);
      }
      else {
          /* error: 0xFE / 0xFF never occur in UTF-8; skip the whole run */
          for (; ((0xFE & *p) > 0xFC); p++);
      }

      if (ntrail > 0) {
          p++; /* step over the lead byte */
          for (; ntrail > 0; ntrail--) {
              if (0x80 != (0xC0 & *p)) {
                  /* malformed or truncated: stop here, do not consume *p */
                  val = 0;
                  break;
              }
              val = (val << 6) | (uint32_t)(*p & 0x3F);
              p++;
          }
      }

      if (pval) *pval = (wchar_t)val;
      return p;
  }
  /**
   * @brief Print a short usage message to stderr.
   *
   * @param progname : program name shown in the usage line
   */
  void usage(char* progname) {
      FILE *out = stderr;

      fprintf(out, "usage: %s\n", progname);
      fputs(" read data from stdin\n", out);
  }
  /**
   * @brief Scan a UTF-8 buffer and report every character with value >= 256.
   *
   * @param msg : buffer holding UTF-8 text; NULL is treated as empty
   * @param len : number of bytes of msg to scan
   *
   * For each reported character one line is written to stdout:
   * "<value / 128> <value % 128> <the character's original bytes>".
   * Characters below 256 are skipped silently.
   *
   * Scanning stops early when the decoder does not advance or when it returns
   * a position beyond msg + len (a sequence cut off by the end of the buffer),
   * so bytes outside the buffer are never written to the output.
   */
  void utf8_parse(const char* msg, unsigned int len) {
      uint8_t *cur;
      uint8_t *pend;

      if (NULL == msg) return;

      cur = (uint8_t *)msg;
      pend = cur + len;
      while (cur < pend) {
          wchar_t val = 0;
          uint8_t *next = get_utf8_value(cur, &val);

          /* no progress would loop forever */
          if (NULL == next || next <= cur) break;
          /* the sequence runs past the end of the buffer: value is unreliable */
          if (next > pend) break;

          if (val >= 256) {
              /* explicit int casts: wchar_t is not int on every platform */
              fprintf(stdout, "%d %d ", (int)(val / 128), (int)(val % 128));
              fwrite(cur, 1, (size_t)(next - cur), stdout);
              fputc('\n', stdout);
          }
          cur = next;
      }
  }
  /**
   * @brief Read a stream line by line and pass each line to utf8_parse().
   *
   * @param fp : stream to read from
   *
   * @return 0 once reading has stopped, -1 if the line buffer cannot be
   *         allocated
   *
   * The line buffer starts at 10000 bytes; its address and size are handed to
   * getline(), and the buffer is released before returning.
   */
  int load_file(FILE *fp) {
      size_t capacity = 10000;
      char *line = (char*)malloc(capacity);

      if (line == NULL) {
          return -1;
      }

      for (;;) {
          if (getline(&line, &capacity, fp) <= 0) {
              break;
          }
          utf8_parse((const char*)line, (unsigned int)strlen(line));
      }

      free(line);
      return 0;
  }
  /**
   * @brief Program entry: process stdin and report UTF-8 characters >= 256.
   *
   * @param argc : argument count; any command-line argument is rejected
   * @param argv : argument vector; argv[0] is used in the usage message
   *
   * @return 0 on success, 1 when arguments are given or the input could not be
   *         processed
   */
  int main(int argc, char * argv[]) {
      if (argc > 1) {
          usage(argv[0]);
          exit(1);
      }
      /* propagate a load failure instead of always exiting with status 0 */
      if (load_file(stdin) != 0) {
          fputs("error: failed to process input\n", stderr);
          return 1;
      }
      return 0;
  }