/*
   Unix SMB/Netbios implementation.
   Version 1.9.
   Character set handling
   Copyright (C) Andrew Tridgell 1992-1998

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
  18. #define CHARSET_C
  19. #include "includes.h"
  20. extern int DEBUGLEVEL;
  /*
   * Codepage definitions.
   *
   * Each row is four bytes, consumed by codepage_initialise() as:
   *   [0] lower-case character code
   *   [1] upper-case character code
   *   [2] non-zero: map lower -> upper
   *   [3] non-zero: map upper -> lower
   * The first row whose bytes [0] and [1] are both zero ends the table.
   * Values in parentheses are the decimal character codes.
   */
  #if !defined(KANJI)

  /* lower->upper mapping for IBM Code Page 850 - MS-DOS Latin 1 */
  unsigned char const cp_850[][4] = {
    {0x85, 0xB7, 1, 1},  /* a grave (133)         -> A grave (183)         */
    {0xA0, 0xB5, 1, 1},  /* a acute (160)         -> A acute (181)         */
    {0x83, 0xB6, 1, 1},  /* a circumflex (131)    -> A circumflex (182)    */
    {0xC6, 0xC7, 1, 1},  /* a tilde (198)         -> A tilde (199)         */
    {0x84, 0x8E, 1, 1},  /* a diaeresis (132)     -> A diaeresis (142)     */
    {0x86, 0x8F, 1, 1},  /* a ring (134)          -> A ring (143)          */
    {0x91, 0x92, 1, 1},  /* ae diphthong (145)    -> AE diphthong (146)    */
    {0x87, 0x80, 1, 1},  /* c cedilla (135)       -> C cedilla (128)       */
    {0x8A, 0xD4, 1, 1},  /* e grave (138)         -> E grave (212)         */
    {0x82, 0x90, 1, 1},  /* e acute (130)         -> E acute (144)         */
    {0x88, 0xD2, 1, 1},  /* e circumflex (136)    -> E circumflex (210)    */
    {0x89, 0xD3, 1, 1},  /* e diaeresis (137)     -> E diaeresis (211)     */
    {0x8D, 0xDE, 1, 1},  /* i grave (141)         -> I grave (222)         */
    {0xA1, 0xD6, 1, 1},  /* i acute (161)         -> I acute (214)         */
    {0x8C, 0xD7, 1, 1},  /* i circumflex (140)    -> I circumflex (215)    */
    {0x8B, 0xD8, 1, 1},  /* i diaeresis (139)     -> I diaeresis (216)     */
    {0xD0, 0xD1, 1, 1},  /* Icelandic eth (208)   -> Icelandic Eth (209)   */
    {0xA4, 0xA5, 1, 1},  /* n tilde (164)         -> N tilde (165)         */
    {0x95, 0xE3, 1, 1},  /* o grave (149)         -> O grave (227)         */
    {0xA2, 0xE0, 1, 1},  /* o acute (162)         -> O acute (224)         */
    {0x93, 0xE2, 1, 1},  /* o circumflex (147)    -> O circumflex (226)    */
    {0xE4, 0xE5, 1, 1},  /* o tilde (228)         -> O tilde (229)         */
    {0x94, 0x99, 1, 1},  /* o diaeresis (148)     -> O diaeresis (153)     */
    {0x9B, 0x9D, 1, 1},  /* o slash (155)         -> O slash (157)         */
    {0x97, 0xEB, 1, 1},  /* u grave (151)         -> U grave (235)         */
    {0xA3, 0xE9, 1, 1},  /* u acute (163)         -> U acute (233)         */
    {0x96, 0xEA, 1, 1},  /* u circumflex (150)    -> U circumflex (234)    */
    {0x81, 0x9A, 1, 1},  /* u diaeresis (129)     -> U diaeresis (154)     */
    {0xEC, 0xED, 1, 1},  /* y acute (236)         -> Y acute (237)         */
    {0xE7, 0xE8, 1, 1},  /* Icelandic thorn (231) -> Icelandic Thorn (232) */
    {0x9C, 0, 0, 0},     /* Pound: listed with no upper-case partner       */
    {0, 0, 0, 0}         /* terminator */
  };

  #else /* KANJI */

  /* lower->upper mapping for IBM Code Page 932 - MS-DOS Japanese SJIS */
  unsigned char const cp_932[][4] = {
    {0, 0, 0, 0}         /* terminator only: no single-byte case pairs */
  };

  #endif /* KANJI */
  /*
   * Per-byte lookup tables (one slot for each of the 256 byte values) and
   * the pointers through which the rest of this file accesses them.
   */

  /* non-zero for every byte that add_dos_char() has registered */
  char xx_dos_char_map[256];
  char *dos_char_map = xx_dos_char_map;

  /* upper_char_map[c] is the upper-case form of byte c */
  char xx_upper_char_map[256];
  char *upper_char_map = xx_upper_char_map;

  /* lower_char_map[c] is the lower-case form of byte c */
  char xx_lower_char_map[256];
  char *lower_char_map = xx_lower_char_map;
  103. /*
  104. * This code has been extended to deal with ascynchronous mappings
  105. * like MS-DOS Latin US (Code page 437) where things like :
  106. * a acute are capitalized to 'A', but the reverse mapping
  107. * must not hold true. This allows the filename case insensitive
  108. * matching in do_match() to work, as the DOS/Win95/NT client
  109. * uses 'A' as a mask to match against characters like a acute.
  110. * This is the meaning behind the parameters that allow a
  111. * mapping from lower to upper, but not upper to lower.
  112. */
  113. static void add_dos_char(int lower, BOOL map_lower_to_upper,
  114. int upper, BOOL map_upper_to_lower)
  115. {
  116. lower &= 0xff;
  117. upper &= 0xff;
  118. DEBUGADD( 6, ( "Adding chars 0x%x 0x%x (l->u = %s) (u->l = %s)\n",
  119. lower, upper,
  120. map_lower_to_upper ? "True" : "False",
  121. map_upper_to_lower ? "True" : "False" ) );
  122. if (lower) dos_char_map[lower] = 1;
  123. if (upper) dos_char_map[upper] = 1;
  124. lower_char_map[lower] = (char)lower; /* Define tolower(lower) */
  125. upper_char_map[upper] = (char)upper; /* Define toupper(upper) */
  126. if (lower && upper) {
  127. if(map_upper_to_lower)
  128. lower_char_map[upper] = (char)lower;
  129. if(map_lower_to_upper)
  130. upper_char_map[lower] = (char)upper;
  131. }
  132. }
  133. /****************************************************************************
  134. initialise the charset arrays
  135. ****************************************************************************/
  136. void charset_initialise(void)
  137. {
  138. int i;
  139. #ifdef LC_ALL
  140. /* include <locale.h> in includes.h if available for OS */
  141. /* we take only standard 7-bit ASCII definitions from ctype */
  142. setlocale(LC_ALL,"C");
  143. #endif
  144. for (i= 0;i<=255;i++) {
  145. dos_char_map[i] = 0;
  146. }
  147. for (i=0;i<=127;i++) {
  148. if (isalnum(i) || strchr("._^$~!#%&-{}()@'`",(char)i))
  149. add_dos_char(i,False,0,False);
  150. }
  151. for (i=0; i<=255; i++) {
  152. char c = (char)i;
  153. upper_char_map[i] = lower_char_map[i] = c;
  154. /* Some systems have buggy isupper/islower for characters
  155. above 127. Best not to rely on them. */
  156. if(i < 128) {
  157. if (isupper((int)c)) lower_char_map[i] = tolower(c);
  158. if (islower((int)c)) upper_char_map[i] = toupper(c);
  159. }
  160. }
  161. }
  162. /****************************************************************************
  163. load the client codepage.
  164. ****************************************************************************/
  165. typedef const unsigned char (*codepage_p)[4];
  166. static codepage_p load_client_codepage( int client_codepage )
  167. {
  168. pstring codepage_file_name;
  169. unsigned char buf[8];
  170. FILE *fp = NULL;
  171. SMB_OFF_T size;
  172. codepage_p cp_p = NULL;
  173. SMB_STRUCT_STAT st;
  174. DEBUG(5, ("load_client_codepage: loading codepage %d.\n", client_codepage));
  175. if(strlen(CODEPAGEDIR) + 14 > sizeof(codepage_file_name))
  176. {
  177. DEBUG(0,("load_client_codepage: filename too long to load\n"));
  178. return NULL;
  179. }
  180. pstrcpy(codepage_file_name, CODEPAGEDIR);
  181. pstrcat(codepage_file_name, "/");
  182. pstrcat(codepage_file_name, "codepage.");
  183. slprintf(&codepage_file_name[strlen(codepage_file_name)],
  184. sizeof(pstring)-(strlen(codepage_file_name)+1),
  185. "%03d",
  186. client_codepage);
  187. if(sys_stat(codepage_file_name,&st)!=0)
  188. {
  189. DEBUG(0,("load_client_codepage: filename %s does not exist.\n",
  190. codepage_file_name));
  191. return NULL;
  192. }
  193. /* Check if it is at least big enough to hold the required
  194. data. Should be 2 byte version, 2 byte codepage, 4 byte length,
  195. plus zero or more bytes of data. Note that the data cannot be more
  196. than 4 * MAXCODEPAGELINES bytes.
  197. */
  198. size = st.st_size;
  199. if( size < CODEPAGE_HEADER_SIZE || size > (CODEPAGE_HEADER_SIZE + 4 * MAXCODEPAGELINES))
  200. {
  201. DEBUG(0,("load_client_codepage: file %s is an incorrect size for a \
  202. code page file (size=%d).\n", codepage_file_name, (int)size));
  203. return NULL;
  204. }
  205. /* Read the first 8 bytes of the codepage file - check
  206. the version number and code page number. All the data
  207. is held in little endian format.
  208. */
  209. if((fp = sys_fopen( codepage_file_name, "r")) == NULL)
  210. {
  211. DEBUG(0,("load_client_codepage: cannot open file %s. Error was %s\n",
  212. codepage_file_name, strerror(errno)));
  213. return NULL;
  214. }
  215. if(fread( buf, 1, CODEPAGE_HEADER_SIZE, fp)!=CODEPAGE_HEADER_SIZE)
  216. {
  217. DEBUG(0,("load_client_codepage: cannot read header from file %s. Error was %s\n",
  218. codepage_file_name, strerror(errno)));
  219. goto clean_and_exit;
  220. }
  221. /* Check the version value */
  222. if(SVAL(buf,CODEPAGE_VERSION_OFFSET) != CODEPAGE_FILE_VERSION_ID)
  223. {
  224. DEBUG(0,("load_client_codepage: filename %s has incorrect version id. \
  225. Needed %hu, got %hu.\n",
  226. codepage_file_name, (uint16)CODEPAGE_FILE_VERSION_ID,
  227. SVAL(buf,CODEPAGE_VERSION_OFFSET)));
  228. goto clean_and_exit;
  229. }
  230. /* Check the codepage matches */
  231. if(SVAL(buf,CODEPAGE_CLIENT_CODEPAGE_OFFSET) != (uint16)client_codepage)
  232. {
  233. DEBUG(0,("load_client_codepage: filename %s has incorrect codepage. \
  234. Needed %hu, got %hu.\n",
  235. codepage_file_name, (uint16)client_codepage,
  236. SVAL(buf,CODEPAGE_CLIENT_CODEPAGE_OFFSET)));
  237. goto clean_and_exit;
  238. }
  239. /* Check the length is correct. */
  240. if(IVAL(buf,CODEPAGE_LENGTH_OFFSET) != (size - CODEPAGE_HEADER_SIZE))
  241. {
  242. DEBUG(0,("load_client_codepage: filename %s has incorrect size headers. \
  243. Needed %u, got %u.\n", codepage_file_name, (uint32)(size - CODEPAGE_HEADER_SIZE),
  244. IVAL(buf,CODEPAGE_LENGTH_OFFSET)));
  245. goto clean_and_exit;
  246. }
  247. size -= CODEPAGE_HEADER_SIZE; /* Remove header */
  248. /* Make sure the size is a multiple of 4. */
  249. if((size % 4 ) != 0)
  250. {
  251. DEBUG(0,("load_client_codepage: filename %s has a codepage size not a \
  252. multiple of 4.\n", codepage_file_name));
  253. goto clean_and_exit;
  254. }
  255. /* Allocate space for the code page file and read it all in. */
  256. if((cp_p = (codepage_p)malloc( size + 4 )) == NULL)
  257. {
  258. DEBUG(0,("load_client_codepage: malloc fail.\n"));
  259. goto clean_and_exit;
  260. }
  261. if(fread( (char *)cp_p, 1, size, fp)!=size)
  262. {
  263. DEBUG(0,("load_client_codepage: read fail on file %s. Error was %s.\n",
  264. codepage_file_name, strerror(errno)));
  265. goto clean_and_exit;
  266. }
  267. /* Ensure array is correctly terminated. */
  268. memset(((char *)cp_p) + size, '\0', 4);
  269. fclose(fp);
  270. return cp_p;
  271. clean_and_exit:
  272. /* pseudo destructor :-) */
  273. if(fp != NULL)
  274. fclose(fp);
  275. if(cp_p)
  276. free((char *)cp_p);
  277. return NULL;
  278. }
  279. /****************************************************************************
  280. initialise the client codepage.
  281. ****************************************************************************/
  282. void codepage_initialise(int client_codepage)
  283. {
  284. int i;
  285. static codepage_p cp = NULL;
  286. if(cp != NULL)
  287. {
  288. DEBUG(6,
  289. ("codepage_initialise: called twice - ignoring second client code page = %d\n",
  290. client_codepage));
  291. return;
  292. }
  293. DEBUG(6,("codepage_initialise: client code page = %d\n", client_codepage));
  294. /*
  295. * Known client codepages - these can be added to.
  296. */
  297. cp = load_client_codepage( client_codepage );
  298. if(cp == NULL)
  299. {
  300. #ifdef KANJI
  301. DEBUG(6,("codepage_initialise: loading dynamic codepage file %s/codepage.%d \
  302. for code page %d failed. Using default client codepage 932\n",
  303. CODEPAGEDIR, client_codepage, client_codepage));
  304. cp = cp_932;
  305. client_codepage = KANJI_CODEPAGE;
  306. #else /* KANJI */
  307. DEBUG(6,("codepage_initialise: loading dynamic codepage file %s/codepage.%d \
  308. for code page %d failed. Using default client codepage 850\n",
  309. CODEPAGEDIR, client_codepage, client_codepage));
  310. cp = cp_850;
  311. client_codepage = MSDOS_LATIN_1_CODEPAGE;
  312. #endif /* KANJI */
  313. }
  314. /*
  315. * Setup the function pointers for the loaded codepage.
  316. */
  317. initialize_multibyte_vectors( client_codepage );
  318. if(cp)
  319. {
  320. for(i = 0; !((cp[i][0] == '\0') && (cp[i][1] == '\0')); i++)
  321. add_dos_char(cp[i][0], (BOOL)cp[i][2], cp[i][1], (BOOL)cp[i][3]);
  322. }
  323. }
  324. /*******************************************************************
  325. add characters depending on a string passed by the user
  326. ********************************************************************/
  327. void add_char_string(const char *s)
  328. {
  329. char *extra_chars = (char *)strdup(s);
  330. char *t;
  331. if (!extra_chars) return;
  332. for (t=strtok(extra_chars," \t\r\n"); t; t=strtok(NULL," \t\r\n")) {
  333. char c1=0,c2=0;
  334. int i1=0,i2=0;
  335. if (isdigit((unsigned char)*t) || (*t)=='-') {
  336. sscanf(t,"%i:%i",&i1,&i2);
  337. add_dos_char(i1,True,i2,True);
  338. } else {
  339. sscanf(t,"%c:%c",&c1,&c2);
  340. add_dos_char((unsigned char)c1,True,(unsigned char)c2, True);
  341. }
  342. }
  343. free(extra_chars);
  344. }