parserInternals.c 61 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166
  1. /*
  2. * parserInternals.c : Internal routines (and obsolete ones) needed for the
  3. * XML and HTML parsers.
  4. *
  5. * See Copyright for the status of this software.
  6. *
  7. * daniel@veillard.com
  8. */
  9. #define IN_LIBXML
  10. #include "libxml.h"
  11. #if defined(_WIN32) && !defined (__CYGWIN__)
  12. #define XML_DIR_SEP '\\'
  13. #else
  14. #define XML_DIR_SEP '/'
  15. #endif
  16. #include <string.h>
  17. #ifdef HAVE_CTYPE_H
  18. #include <ctype.h>
  19. #endif
  20. #ifdef HAVE_STDLIB_H
  21. #include <stdlib.h>
  22. #endif
  23. #ifdef HAVE_SYS_STAT_H
  24. #include <sys/stat.h>
  25. #endif
  26. #ifdef HAVE_FCNTL_H
  27. #include <fcntl.h>
  28. #endif
  29. #ifdef HAVE_UNISTD_H
  30. #include <unistd.h>
  31. #endif
  32. #ifdef LIBXML_ZLIB_ENABLED
  33. #include <zlib.h>
  34. #endif
  35. #include <libxml/xmlmemory.h>
  36. #include <libxml/tree.h>
  37. #include <libxml/parser.h>
  38. #include <libxml/parserInternals.h>
  39. #include <libxml/valid.h>
  40. #include <libxml/entities.h>
  41. #include <libxml/xmlerror.h>
  42. #include <libxml/encoding.h>
  43. #include <libxml/valid.h>
  44. #include <libxml/xmlIO.h>
  45. #include <libxml/uri.h>
  46. #include <libxml/dict.h>
  47. #include <libxml/SAX.h>
  48. #ifdef LIBXML_CATALOG_ENABLED
  49. #include <libxml/catalog.h>
  50. #endif
  51. #include <libxml/globals.h>
  52. #include <libxml/chvalid.h>
  53. #define CUR(ctxt) ctxt->input->cur
  54. #define END(ctxt) ctxt->input->end
  55. #define VALID_CTXT(ctxt) (CUR(ctxt) <= END(ctxt))
  56. #include "buf.h"
  57. #include "enc.h"
  58. /*
  59. * Various global defaults for parsing
  60. */
  61. /**
  62. * xmlCheckVersion:
  63. * @version: the include version number
  64. *
  65. * check the compiled lib version against the include one.
  66. * This can warn or immediately kill the application
  67. */
  68. void
  69. xmlCheckVersion(int version) {
  70. int myversion = (int) LIBXML_VERSION;
  71. xmlInitParser();
  72. if ((myversion / 10000) != (version / 10000)) {
  73. xmlGenericError(xmlGenericErrorContext,
  74. "Fatal: program compiled against libxml %d using libxml %d\n",
  75. (version / 10000), (myversion / 10000));
  76. fprintf(stderr,
  77. "Fatal: program compiled against libxml %d using libxml %d\n",
  78. (version / 10000), (myversion / 10000));
  79. }
  80. if ((myversion / 100) < (version / 100)) {
  81. xmlGenericError(xmlGenericErrorContext,
  82. "Warning: program compiled against libxml %d using older %d\n",
  83. (version / 100), (myversion / 100));
  84. }
  85. }
  86. /************************************************************************
  87. * *
  88. * Some factorized error routines *
  89. * *
  90. ************************************************************************/
  91. /**
  92. * xmlErrMemory:
  93. * @ctxt: an XML parser context
  94. * @extra: extra information
  95. *
  96. * Handle a redefinition of attribute error
  97. */
  98. void
  99. xmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
  100. {
  101. if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  102. (ctxt->instate == XML_PARSER_EOF))
  103. return;
  104. if (ctxt != NULL) {
  105. ctxt->errNo = XML_ERR_NO_MEMORY;
  106. ctxt->instate = XML_PARSER_EOF;
  107. ctxt->disableSAX = 1;
  108. }
  109. if (extra)
  110. __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  111. XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
  112. NULL, NULL, 0, 0,
  113. "Memory allocation failed : %s\n", extra);
  114. else
  115. __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  116. XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
  117. NULL, NULL, 0, 0, "Memory allocation failed\n");
  118. }
  119. /**
  120. * __xmlErrEncoding:
  121. * @ctxt: an XML parser context
  122. * @xmlerr: the error number
  123. * @msg: the error message
  124. * @str1: an string info
  125. * @str2: an string info
  126. *
  127. * Handle an encoding error
  128. */
  129. void
  130. __xmlErrEncoding(xmlParserCtxtPtr ctxt, xmlParserErrors xmlerr,
  131. const char *msg, const xmlChar * str1, const xmlChar * str2)
  132. {
  133. if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  134. (ctxt->instate == XML_PARSER_EOF))
  135. return;
  136. if (ctxt != NULL)
  137. ctxt->errNo = xmlerr;
  138. __xmlRaiseError(NULL, NULL, NULL,
  139. ctxt, NULL, XML_FROM_PARSER, xmlerr, XML_ERR_FATAL,
  140. NULL, 0, (const char *) str1, (const char *) str2,
  141. NULL, 0, 0, msg, str1, str2);
  142. if (ctxt != NULL) {
  143. ctxt->wellFormed = 0;
  144. if (ctxt->recovery == 0)
  145. ctxt->disableSAX = 1;
  146. }
  147. }
  148. /**
  149. * xmlErrInternal:
  150. * @ctxt: an XML parser context
  151. * @msg: the error message
  152. * @str: error information
  153. *
  154. * Handle an internal error
  155. */
  156. static void LIBXML_ATTR_FORMAT(2,0)
  157. xmlErrInternal(xmlParserCtxtPtr ctxt, const char *msg, const xmlChar * str)
  158. {
  159. if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  160. (ctxt->instate == XML_PARSER_EOF))
  161. return;
  162. if (ctxt != NULL)
  163. ctxt->errNo = XML_ERR_INTERNAL_ERROR;
  164. __xmlRaiseError(NULL, NULL, NULL,
  165. ctxt, NULL, XML_FROM_PARSER, XML_ERR_INTERNAL_ERROR,
  166. XML_ERR_FATAL, NULL, 0, (const char *) str, NULL, NULL,
  167. 0, 0, msg, str);
  168. if (ctxt != NULL) {
  169. ctxt->wellFormed = 0;
  170. if (ctxt->recovery == 0)
  171. ctxt->disableSAX = 1;
  172. }
  173. }
  174. /**
  175. * xmlErrEncodingInt:
  176. * @ctxt: an XML parser context
  177. * @error: the error number
  178. * @msg: the error message
  179. * @val: an integer value
  180. *
  181. * n encoding error
  182. */
  183. static void LIBXML_ATTR_FORMAT(3,0)
  184. xmlErrEncodingInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
  185. const char *msg, int val)
  186. {
  187. if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  188. (ctxt->instate == XML_PARSER_EOF))
  189. return;
  190. if (ctxt != NULL)
  191. ctxt->errNo = error;
  192. __xmlRaiseError(NULL, NULL, NULL,
  193. ctxt, NULL, XML_FROM_PARSER, error, XML_ERR_FATAL,
  194. NULL, 0, NULL, NULL, NULL, val, 0, msg, val);
  195. if (ctxt != NULL) {
  196. ctxt->wellFormed = 0;
  197. if (ctxt->recovery == 0)
  198. ctxt->disableSAX = 1;
  199. }
  200. }
  /**
   * xmlIsLetter:
   * @c:  a Unicode character (int)
   *
   * Check whether the character is allowed by the production
   * [84] Letter ::= BaseChar | Ideographic
   *
   * Returns 1 if @c is a base character or an ideographic character,
   * 0 otherwise
   */
  int
  xmlIsLetter(int c) {
      if (IS_BASECHAR(c))
          return(1);
      if (IS_IDEOGRAPHIC(c))
          return(1);
      return(0);
  }
  214. /************************************************************************
  215. * *
  216. * Input handling functions for progressive parsing *
  217. * *
  218. ************************************************************************/
  219. /* #define DEBUG_INPUT */
  220. /* #define DEBUG_STACK */
  221. /* #define DEBUG_PUSH */
  222. /* we need to keep enough input to show errors in context */
  223. #define LINE_LEN 80
  #ifdef DEBUG_INPUT
  #define CHECK_BUFFER(in) check_buffer(in)

  /*
   * check_buffer:
   * @in:  the parser input to verify
   *
   * Debug-only consistency check (compiled when DEBUG_INPUT is defined).
   * Reports through the generic error handler when the input's base does
   * not match the buffer content, when cur lies before base, or when cur
   * lies beyond the used part of the buffer, then dumps the pointers and
   * sizes involved.
   */
  static
  void check_buffer(xmlParserInputPtr in) {
      if (in->base != xmlBufContent(in->buf->buffer)) {
          xmlGenericError(xmlGenericErrorContext,
                  "xmlParserInput: base mismatch problem\n");
      }
      if (in->cur < in->base) {
          xmlGenericError(xmlGenericErrorContext,
                  "xmlParserInput: cur < base problem\n");
      }
      if (in->cur > in->base + xmlBufUse(in->buf->buffer)) {
          xmlGenericError(xmlGenericErrorContext,
                  "xmlParserInput: cur > base + use problem\n");
      }
      /*
       * Pointers are printed with %p and the offset/size with long-sized
       * conversions: casting a pointer to int and using %x / %d does not
       * match the argument types and truncates on 64-bit platforms.
       */
      xmlGenericError(xmlGenericErrorContext,
              "buffer %p : content %p, cur %ld, use %lu\n",
              (const void *) in,
              (const void *) xmlBufContent(in->buf->buffer),
              (long) (in->cur - in->base),
              (unsigned long) xmlBufUse(in->buf->buffer));
  }
  #else
  #define CHECK_BUFFER(in)
  #endif
  247. /**
  248. * xmlParserInputRead:
  249. * @in: an XML parser input
  250. * @len: an indicative size for the lookahead
  251. *
  252. * This function was internal and is deprecated.
  253. *
  254. * Returns -1 as this is an error to use it.
  255. */
  256. int
  257. xmlParserInputRead(xmlParserInputPtr in ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED) {
  258. return(-1);
  259. }
  260. /**
  261. * xmlParserInputGrow:
  262. * @in: an XML parser input
  263. * @len: an indicative size for the lookahead
  264. *
  265. * This function increase the input for the parser. It tries to
  266. * preserve pointers to the input buffer, and keep already read data
  267. *
  268. * Returns the amount of char read, or -1 in case of error, 0 indicate the
  269. * end of this entity
  270. */
  271. int
  272. xmlParserInputGrow(xmlParserInputPtr in, int len) {
  273. int ret;
  274. size_t indx;
  275. const xmlChar *content;
  276. if ((in == NULL) || (len < 0)) return(-1);
  277. #ifdef DEBUG_INPUT
  278. xmlGenericError(xmlGenericErrorContext, "Grow\n");
  279. #endif
  280. if (in->buf == NULL) return(-1);
  281. if (in->base == NULL) return(-1);
  282. if (in->cur == NULL) return(-1);
  283. if (in->buf->buffer == NULL) return(-1);
  284. CHECK_BUFFER(in);
  285. indx = in->cur - in->base;
  286. if (xmlBufUse(in->buf->buffer) > (unsigned int) indx + INPUT_CHUNK) {
  287. CHECK_BUFFER(in);
  288. return(0);
  289. }
  290. if (in->buf->readcallback != NULL) {
  291. ret = xmlParserInputBufferGrow(in->buf, len);
  292. } else
  293. return(0);
  294. /*
  295. * NOTE : in->base may be a "dangling" i.e. freed pointer in this
  296. * block, but we use it really as an integer to do some
  297. * pointer arithmetic. Insure will raise it as a bug but in
  298. * that specific case, that's not !
  299. */
  300. content = xmlBufContent(in->buf->buffer);
  301. if (in->base != content) {
  302. /*
  303. * the buffer has been reallocated
  304. */
  305. indx = in->cur - in->base;
  306. in->base = content;
  307. in->cur = &content[indx];
  308. }
  309. in->end = xmlBufEnd(in->buf->buffer);
  310. CHECK_BUFFER(in);
  311. return(ret);
  312. }
  313. /**
  314. * xmlParserInputShrink:
  315. * @in: an XML parser input
  316. *
  317. * This function removes used input for the parser.
  318. */
  319. void
  320. xmlParserInputShrink(xmlParserInputPtr in) {
  321. size_t used;
  322. size_t ret;
  323. size_t indx;
  324. const xmlChar *content;
  325. #ifdef DEBUG_INPUT
  326. xmlGenericError(xmlGenericErrorContext, "Shrink\n");
  327. #endif
  328. if (in == NULL) return;
  329. if (in->buf == NULL) return;
  330. if (in->base == NULL) return;
  331. if (in->cur == NULL) return;
  332. if (in->buf->buffer == NULL) return;
  333. CHECK_BUFFER(in);
  334. used = in->cur - xmlBufContent(in->buf->buffer);
  335. /*
  336. * Do not shrink on large buffers whose only a tiny fraction
  337. * was consumed
  338. */
  339. if (used > INPUT_CHUNK) {
  340. ret = xmlBufShrink(in->buf->buffer, used - LINE_LEN);
  341. if (ret > 0) {
  342. in->cur -= ret;
  343. in->consumed += ret;
  344. }
  345. in->end = xmlBufEnd(in->buf->buffer);
  346. }
  347. CHECK_BUFFER(in);
  348. if (xmlBufUse(in->buf->buffer) > INPUT_CHUNK) {
  349. return;
  350. }
  351. xmlParserInputBufferRead(in->buf, 2 * INPUT_CHUNK);
  352. content = xmlBufContent(in->buf->buffer);
  353. if (in->base != content) {
  354. /*
  355. * the buffer has been reallocated
  356. */
  357. indx = in->cur - in->base;
  358. in->base = content;
  359. in->cur = &content[indx];
  360. }
  361. in->end = xmlBufEnd(in->buf->buffer);
  362. CHECK_BUFFER(in);
  363. }
  364. /************************************************************************
  365. * *
  366. * UTF8 character input and related functions *
  367. * *
  368. ************************************************************************/
  369. /**
  370. * xmlNextChar:
  371. * @ctxt: the XML parser context
  372. *
  373. * Skip to the next char input char.
  374. */
  375. void
  376. xmlNextChar(xmlParserCtxtPtr ctxt)
  377. {
  378. if ((ctxt == NULL) || (ctxt->instate == XML_PARSER_EOF) ||
  379. (ctxt->input == NULL))
  380. return;
  381. if (!(VALID_CTXT(ctxt))) {
  382. xmlErrInternal(ctxt, "Parser input data memory error\n", NULL);
  383. ctxt->errNo = XML_ERR_INTERNAL_ERROR;
  384. xmlStopParser(ctxt);
  385. return;
  386. }
  387. if ((*ctxt->input->cur == 0) &&
  388. (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
  389. return;
  390. }
  391. if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
  392. const unsigned char *cur;
  393. unsigned char c;
  394. /*
  395. * 2.11 End-of-Line Handling
  396. * the literal two-character sequence "#xD#xA" or a standalone
  397. * literal #xD, an XML processor must pass to the application
  398. * the single character #xA.
  399. */
  400. if (*(ctxt->input->cur) == '\n') {
  401. ctxt->input->line++; ctxt->input->col = 1;
  402. } else
  403. ctxt->input->col++;
  404. /*
  405. * We are supposed to handle UTF8, check it's valid
  406. * From rfc2044: encoding of the Unicode values on UTF-8:
  407. *
  408. * UCS-4 range (hex.) UTF-8 octet sequence (binary)
  409. * 0000 0000-0000 007F 0xxxxxxx
  410. * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
  411. * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
  412. *
  413. * Check for the 0x110000 limit too
  414. */
  415. cur = ctxt->input->cur;
  416. c = *cur;
  417. if (c & 0x80) {
  418. if (c == 0xC0)
  419. goto encoding_error;
  420. if (cur[1] == 0) {
  421. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  422. cur = ctxt->input->cur;
  423. }
  424. if ((cur[1] & 0xc0) != 0x80)
  425. goto encoding_error;
  426. if ((c & 0xe0) == 0xe0) {
  427. unsigned int val;
  428. if (cur[2] == 0) {
  429. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  430. cur = ctxt->input->cur;
  431. }
  432. if ((cur[2] & 0xc0) != 0x80)
  433. goto encoding_error;
  434. if ((c & 0xf0) == 0xf0) {
  435. if (cur[3] == 0) {
  436. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  437. cur = ctxt->input->cur;
  438. }
  439. if (((c & 0xf8) != 0xf0) ||
  440. ((cur[3] & 0xc0) != 0x80))
  441. goto encoding_error;
  442. /* 4-byte code */
  443. ctxt->input->cur += 4;
  444. val = (cur[0] & 0x7) << 18;
  445. val |= (cur[1] & 0x3f) << 12;
  446. val |= (cur[2] & 0x3f) << 6;
  447. val |= cur[3] & 0x3f;
  448. } else {
  449. /* 3-byte code */
  450. ctxt->input->cur += 3;
  451. val = (cur[0] & 0xf) << 12;
  452. val |= (cur[1] & 0x3f) << 6;
  453. val |= cur[2] & 0x3f;
  454. }
  455. if (((val > 0xd7ff) && (val < 0xe000)) ||
  456. ((val > 0xfffd) && (val < 0x10000)) ||
  457. (val >= 0x110000)) {
  458. xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
  459. "Char 0x%X out of allowed range\n",
  460. val);
  461. }
  462. } else
  463. /* 2-byte code */
  464. ctxt->input->cur += 2;
  465. } else
  466. /* 1-byte code */
  467. ctxt->input->cur++;
  468. } else {
  469. /*
  470. * Assume it's a fixed length encoding (1) with
  471. * a compatible encoding for the ASCII set, since
  472. * XML constructs only use < 128 chars
  473. */
  474. if (*(ctxt->input->cur) == '\n') {
  475. ctxt->input->line++; ctxt->input->col = 1;
  476. } else
  477. ctxt->input->col++;
  478. ctxt->input->cur++;
  479. }
  480. if (*ctxt->input->cur == 0)
  481. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  482. return;
  483. encoding_error:
  484. /*
  485. * If we detect an UTF8 error that probably mean that the
  486. * input encoding didn't get properly advertised in the
  487. * declaration header. Report the error and switch the encoding
  488. * to ISO-Latin-1 (if you don't like this policy, just declare the
  489. * encoding !)
  490. */
  491. if ((ctxt == NULL) || (ctxt->input == NULL) ||
  492. (ctxt->input->end - ctxt->input->cur < 4)) {
  493. __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
  494. "Input is not proper UTF-8, indicate encoding !\n",
  495. NULL, NULL);
  496. } else {
  497. char buffer[150];
  498. snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
  499. ctxt->input->cur[0], ctxt->input->cur[1],
  500. ctxt->input->cur[2], ctxt->input->cur[3]);
  501. __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
  502. "Input is not proper UTF-8, indicate encoding !\n%s",
  503. BAD_CAST buffer, NULL);
  504. }
  505. ctxt->charset = XML_CHAR_ENCODING_8859_1;
  506. ctxt->input->cur++;
  507. return;
  508. }
  509. /**
  510. * xmlCurrentChar:
  511. * @ctxt: the XML parser context
  512. * @len: pointer to the length of the char read
  513. *
  514. * The current char value, if using UTF-8 this may actually span multiple
  515. * bytes in the input buffer. Implement the end of line normalization:
  516. * 2.11 End-of-Line Handling
  517. * Wherever an external parsed entity or the literal entity value
  518. * of an internal parsed entity contains either the literal two-character
  519. * sequence "#xD#xA" or a standalone literal #xD, an XML processor
  520. * must pass to the application the single character #xA.
  521. * This behavior can conveniently be produced by normalizing all
  522. * line breaks to #xA on input, before parsing.)
  523. *
  524. * Returns the current char value and its length
  525. */
  526. int
  527. xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
  528. if ((ctxt == NULL) || (len == NULL) || (ctxt->input == NULL)) return(0);
  529. if (ctxt->instate == XML_PARSER_EOF)
  530. return(0);
  531. if ((*ctxt->input->cur >= 0x20) && (*ctxt->input->cur <= 0x7F)) {
  532. *len = 1;
  533. return((int) *ctxt->input->cur);
  534. }
  535. if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
  536. /*
  537. * We are supposed to handle UTF8, check it's valid
  538. * From rfc2044: encoding of the Unicode values on UTF-8:
  539. *
  540. * UCS-4 range (hex.) UTF-8 octet sequence (binary)
  541. * 0000 0000-0000 007F 0xxxxxxx
  542. * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
  543. * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
  544. *
  545. * Check for the 0x110000 limit too
  546. */
  547. const unsigned char *cur = ctxt->input->cur;
  548. unsigned char c;
  549. unsigned int val;
  550. c = *cur;
  551. if (c & 0x80) {
  552. if (((c & 0x40) == 0) || (c == 0xC0))
  553. goto encoding_error;
  554. if (cur[1] == 0) {
  555. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  556. cur = ctxt->input->cur;
  557. }
  558. if ((cur[1] & 0xc0) != 0x80)
  559. goto encoding_error;
  560. if ((c & 0xe0) == 0xe0) {
  561. if (cur[2] == 0) {
  562. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  563. cur = ctxt->input->cur;
  564. }
  565. if ((cur[2] & 0xc0) != 0x80)
  566. goto encoding_error;
  567. if ((c & 0xf0) == 0xf0) {
  568. if (cur[3] == 0) {
  569. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  570. cur = ctxt->input->cur;
  571. }
  572. if (((c & 0xf8) != 0xf0) ||
  573. ((cur[3] & 0xc0) != 0x80))
  574. goto encoding_error;
  575. /* 4-byte code */
  576. *len = 4;
  577. val = (cur[0] & 0x7) << 18;
  578. val |= (cur[1] & 0x3f) << 12;
  579. val |= (cur[2] & 0x3f) << 6;
  580. val |= cur[3] & 0x3f;
  581. if (val < 0x10000)
  582. goto encoding_error;
  583. } else {
  584. /* 3-byte code */
  585. *len = 3;
  586. val = (cur[0] & 0xf) << 12;
  587. val |= (cur[1] & 0x3f) << 6;
  588. val |= cur[2] & 0x3f;
  589. if (val < 0x800)
  590. goto encoding_error;
  591. }
  592. } else {
  593. /* 2-byte code */
  594. *len = 2;
  595. val = (cur[0] & 0x1f) << 6;
  596. val |= cur[1] & 0x3f;
  597. if (val < 0x80)
  598. goto encoding_error;
  599. }
  600. if (!IS_CHAR(val)) {
  601. xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
  602. "Char 0x%X out of allowed range\n", val);
  603. }
  604. return(val);
  605. } else {
  606. /* 1-byte code */
  607. *len = 1;
  608. if (*ctxt->input->cur == 0)
  609. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  610. if ((*ctxt->input->cur == 0) &&
  611. (ctxt->input->end > ctxt->input->cur)) {
  612. xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
  613. "Char 0x0 out of allowed range\n", 0);
  614. }
  615. if (*ctxt->input->cur == 0xD) {
  616. if (ctxt->input->cur[1] == 0xA) {
  617. ctxt->input->cur++;
  618. }
  619. return(0xA);
  620. }
  621. return((int) *ctxt->input->cur);
  622. }
  623. }
  624. /*
  625. * Assume it's a fixed length encoding (1) with
  626. * a compatible encoding for the ASCII set, since
  627. * XML constructs only use < 128 chars
  628. */
  629. *len = 1;
  630. if (*ctxt->input->cur == 0xD) {
  631. if (ctxt->input->cur[1] == 0xA) {
  632. ctxt->input->cur++;
  633. }
  634. return(0xA);
  635. }
  636. return((int) *ctxt->input->cur);
  637. encoding_error:
  638. /*
  639. * An encoding problem may arise from a truncated input buffer
  640. * splitting a character in the middle. In that case do not raise
  641. * an error but return 0 to indicate an end of stream problem
  642. */
  643. if (ctxt->input->end - ctxt->input->cur < 4) {
  644. *len = 0;
  645. return(0);
  646. }
  647. /*
  648. * If we detect an UTF8 error that probably mean that the
  649. * input encoding didn't get properly advertised in the
  650. * declaration header. Report the error and switch the encoding
  651. * to ISO-Latin-1 (if you don't like this policy, just declare the
  652. * encoding !)
  653. */
  654. {
  655. char buffer[150];
  656. snprintf(&buffer[0], 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
  657. ctxt->input->cur[0], ctxt->input->cur[1],
  658. ctxt->input->cur[2], ctxt->input->cur[3]);
  659. __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
  660. "Input is not proper UTF-8, indicate encoding !\n%s",
  661. BAD_CAST buffer, NULL);
  662. }
  663. ctxt->charset = XML_CHAR_ENCODING_8859_1;
  664. *len = 1;
  665. return((int) *ctxt->input->cur);
  666. }
  667. /**
  668. * xmlStringCurrentChar:
  669. * @ctxt: the XML parser context
  670. * @cur: pointer to the beginning of the char
  671. * @len: pointer to the length of the char read
  672. *
  673. * The current char value, if using UTF-8 this may actually span multiple
  674. * bytes in the input buffer.
  675. *
  676. * Returns the current char value and its length
  677. */
  678. int
  679. xmlStringCurrentChar(xmlParserCtxtPtr ctxt, const xmlChar * cur, int *len)
  680. {
  681. if ((len == NULL) || (cur == NULL)) return(0);
  682. if ((ctxt == NULL) || (ctxt->charset == XML_CHAR_ENCODING_UTF8)) {
  683. /*
  684. * We are supposed to handle UTF8, check it's valid
  685. * From rfc2044: encoding of the Unicode values on UTF-8:
  686. *
  687. * UCS-4 range (hex.) UTF-8 octet sequence (binary)
  688. * 0000 0000-0000 007F 0xxxxxxx
  689. * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
  690. * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
  691. *
  692. * Check for the 0x110000 limit too
  693. */
  694. unsigned char c;
  695. unsigned int val;
  696. c = *cur;
  697. if (c & 0x80) {
  698. if ((cur[1] & 0xc0) != 0x80)
  699. goto encoding_error;
  700. if ((c & 0xe0) == 0xe0) {
  701. if ((cur[2] & 0xc0) != 0x80)
  702. goto encoding_error;
  703. if ((c & 0xf0) == 0xf0) {
  704. if (((c & 0xf8) != 0xf0) || ((cur[3] & 0xc0) != 0x80))
  705. goto encoding_error;
  706. /* 4-byte code */
  707. *len = 4;
  708. val = (cur[0] & 0x7) << 18;
  709. val |= (cur[1] & 0x3f) << 12;
  710. val |= (cur[2] & 0x3f) << 6;
  711. val |= cur[3] & 0x3f;
  712. } else {
  713. /* 3-byte code */
  714. *len = 3;
  715. val = (cur[0] & 0xf) << 12;
  716. val |= (cur[1] & 0x3f) << 6;
  717. val |= cur[2] & 0x3f;
  718. }
  719. } else {
  720. /* 2-byte code */
  721. *len = 2;
  722. val = (cur[0] & 0x1f) << 6;
  723. val |= cur[1] & 0x3f;
  724. }
  725. if (!IS_CHAR(val)) {
  726. xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
  727. "Char 0x%X out of allowed range\n", val);
  728. }
  729. return (val);
  730. } else {
  731. /* 1-byte code */
  732. *len = 1;
  733. return ((int) *cur);
  734. }
  735. }
  736. /*
  737. * Assume it's a fixed length encoding (1) with
  738. * a compatible encoding for the ASCII set, since
  739. * XML constructs only use < 128 chars
  740. */
  741. *len = 1;
  742. return ((int) *cur);
  743. encoding_error:
  744. /*
  745. * An encoding problem may arise from a truncated input buffer
  746. * splitting a character in the middle. In that case do not raise
  747. * an error but return 0 to indicate an end of stream problem
  748. */
  749. if ((ctxt == NULL) || (ctxt->input == NULL) ||
  750. (ctxt->input->end - ctxt->input->cur < 4)) {
  751. *len = 0;
  752. return(0);
  753. }
  754. /*
  755. * If we detect an UTF8 error that probably mean that the
  756. * input encoding didn't get properly advertised in the
  757. * declaration header. Report the error and switch the encoding
  758. * to ISO-Latin-1 (if you don't like this policy, just declare the
  759. * encoding !)
  760. */
  761. {
  762. char buffer[150];
  763. snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
  764. ctxt->input->cur[0], ctxt->input->cur[1],
  765. ctxt->input->cur[2], ctxt->input->cur[3]);
  766. __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
  767. "Input is not proper UTF-8, indicate encoding !\n%s",
  768. BAD_CAST buffer, NULL);
  769. }
  770. *len = 1;
  771. return ((int) *cur);
  772. }
  773. /**
  774. * xmlCopyCharMultiByte:
  775. * @out: pointer to an array of xmlChar
  776. * @val: the char value
  777. *
  778. * append the char value in the array
  779. *
  780. * Returns the number of xmlChar written
  781. */
  782. int
  783. xmlCopyCharMultiByte(xmlChar *out, int val) {
  784. if (out == NULL) return(0);
  785. /*
  786. * We are supposed to handle UTF8, check it's valid
  787. * From rfc2044: encoding of the Unicode values on UTF-8:
  788. *
  789. * UCS-4 range (hex.) UTF-8 octet sequence (binary)
  790. * 0000 0000-0000 007F 0xxxxxxx
  791. * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
  792. * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
  793. */
  794. if (val >= 0x80) {
  795. xmlChar *savedout = out;
  796. int bits;
  797. if (val < 0x800) { *out++= (val >> 6) | 0xC0; bits= 0; }
  798. else if (val < 0x10000) { *out++= (val >> 12) | 0xE0; bits= 6;}
  799. else if (val < 0x110000) { *out++= (val >> 18) | 0xF0; bits= 12; }
  800. else {
  801. xmlErrEncodingInt(NULL, XML_ERR_INVALID_CHAR,
  802. "Internal error, xmlCopyCharMultiByte 0x%X out of bound\n",
  803. val);
  804. return(0);
  805. }
  806. for ( ; bits >= 0; bits-= 6)
  807. *out++= ((val >> bits) & 0x3F) | 0x80 ;
  808. return (out - savedout);
  809. }
  810. *out = (xmlChar) val;
  811. return 1;
  812. }
  813. /**
  814. * xmlCopyChar:
  815. * @len: Ignored, compatibility
  816. * @out: pointer to an array of xmlChar
  817. * @val: the char value
  818. *
  819. * append the char value in the array
  820. *
  821. * Returns the number of xmlChar written
  822. */
  823. int
  824. xmlCopyChar(int len ATTRIBUTE_UNUSED, xmlChar *out, int val) {
  825. if (out == NULL) return(0);
  826. /* the len parameter is ignored */
  827. if (val >= 0x80) {
  828. return(xmlCopyCharMultiByte (out, val));
  829. }
  830. *out = (xmlChar) val;
  831. return 1;
  832. }
  833. /************************************************************************
  834. * *
  835. * Commodity functions to switch encodings *
  836. * *
  837. ************************************************************************/
  838. static int
  839. xmlSwitchToEncodingInt(xmlParserCtxtPtr ctxt,
  840. xmlCharEncodingHandlerPtr handler, int len);
  841. static int
  842. xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
  843. xmlCharEncodingHandlerPtr handler, int len);
  844. /**
  845. * xmlSwitchEncoding:
  846. * @ctxt: the parser context
  847. * @enc: the encoding value (number)
  848. *
  849. * change the input functions when discovering the character encoding
  850. * of a given entity.
  851. *
  852. * Returns 0 in case of success, -1 otherwise
  853. */
  854. int
  855. xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
  856. {
  857. xmlCharEncodingHandlerPtr handler;
  858. int len = -1;
  859. int ret;
  860. if (ctxt == NULL) return(-1);
  861. switch (enc) {
  862. case XML_CHAR_ENCODING_ERROR:
  863. __xmlErrEncoding(ctxt, XML_ERR_UNKNOWN_ENCODING,
  864. "encoding unknown\n", NULL, NULL);
  865. return(-1);
  866. case XML_CHAR_ENCODING_NONE:
  867. /* let's assume it's UTF-8 without the XML decl */
  868. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  869. return(0);
  870. case XML_CHAR_ENCODING_UTF8:
  871. /* default encoding, no conversion should be needed */
  872. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  873. /*
  874. * Errata on XML-1.0 June 20 2001
  875. * Specific handling of the Byte Order Mark for
  876. * UTF-8
  877. */
  878. if ((ctxt->input != NULL) &&
  879. (ctxt->input->cur[0] == 0xEF) &&
  880. (ctxt->input->cur[1] == 0xBB) &&
  881. (ctxt->input->cur[2] == 0xBF)) {
  882. ctxt->input->cur += 3;
  883. }
  884. return(0);
  885. case XML_CHAR_ENCODING_UTF16LE:
  886. case XML_CHAR_ENCODING_UTF16BE:
  887. /*The raw input characters are encoded
  888. *in UTF-16. As we expect this function
  889. *to be called after xmlCharEncInFunc, we expect
  890. *ctxt->input->cur to contain UTF-8 encoded characters.
  891. *So the raw UTF16 Byte Order Mark
  892. *has also been converted into
  893. *an UTF-8 BOM. Let's skip that BOM.
  894. */
  895. if ((ctxt->input != NULL) && (ctxt->input->cur != NULL) &&
  896. (ctxt->input->cur[0] == 0xEF) &&
  897. (ctxt->input->cur[1] == 0xBB) &&
  898. (ctxt->input->cur[2] == 0xBF)) {
  899. ctxt->input->cur += 3;
  900. }
  901. len = 90;
  902. break;
  903. case XML_CHAR_ENCODING_UCS2:
  904. len = 90;
  905. break;
  906. case XML_CHAR_ENCODING_UCS4BE:
  907. case XML_CHAR_ENCODING_UCS4LE:
  908. case XML_CHAR_ENCODING_UCS4_2143:
  909. case XML_CHAR_ENCODING_UCS4_3412:
  910. len = 180;
  911. break;
  912. case XML_CHAR_ENCODING_EBCDIC:
  913. case XML_CHAR_ENCODING_8859_1:
  914. case XML_CHAR_ENCODING_8859_2:
  915. case XML_CHAR_ENCODING_8859_3:
  916. case XML_CHAR_ENCODING_8859_4:
  917. case XML_CHAR_ENCODING_8859_5:
  918. case XML_CHAR_ENCODING_8859_6:
  919. case XML_CHAR_ENCODING_8859_7:
  920. case XML_CHAR_ENCODING_8859_8:
  921. case XML_CHAR_ENCODING_8859_9:
  922. case XML_CHAR_ENCODING_ASCII:
  923. case XML_CHAR_ENCODING_2022_JP:
  924. case XML_CHAR_ENCODING_SHIFT_JIS:
  925. case XML_CHAR_ENCODING_EUC_JP:
  926. len = 45;
  927. break;
  928. }
  929. handler = xmlGetCharEncodingHandler(enc);
  930. if (handler == NULL) {
  931. /*
  932. * Default handlers.
  933. */
  934. switch (enc) {
  935. case XML_CHAR_ENCODING_ASCII:
  936. /* default encoding, no conversion should be needed */
  937. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  938. return(0);
  939. case XML_CHAR_ENCODING_UTF16LE:
  940. break;
  941. case XML_CHAR_ENCODING_UTF16BE:
  942. break;
  943. case XML_CHAR_ENCODING_UCS4LE:
  944. __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  945. "encoding not supported %s\n",
  946. BAD_CAST "USC4 little endian", NULL);
  947. break;
  948. case XML_CHAR_ENCODING_UCS4BE:
  949. __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  950. "encoding not supported %s\n",
  951. BAD_CAST "USC4 big endian", NULL);
  952. break;
  953. case XML_CHAR_ENCODING_EBCDIC:
  954. __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  955. "encoding not supported %s\n",
  956. BAD_CAST "EBCDIC", NULL);
  957. break;
  958. case XML_CHAR_ENCODING_UCS4_2143:
  959. __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  960. "encoding not supported %s\n",
  961. BAD_CAST "UCS4 2143", NULL);
  962. break;
  963. case XML_CHAR_ENCODING_UCS4_3412:
  964. __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  965. "encoding not supported %s\n",
  966. BAD_CAST "UCS4 3412", NULL);
  967. break;
  968. case XML_CHAR_ENCODING_UCS2:
  969. __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  970. "encoding not supported %s\n",
  971. BAD_CAST "UCS2", NULL);
  972. break;
  973. case XML_CHAR_ENCODING_8859_1:
  974. case XML_CHAR_ENCODING_8859_2:
  975. case XML_CHAR_ENCODING_8859_3:
  976. case XML_CHAR_ENCODING_8859_4:
  977. case XML_CHAR_ENCODING_8859_5:
  978. case XML_CHAR_ENCODING_8859_6:
  979. case XML_CHAR_ENCODING_8859_7:
  980. case XML_CHAR_ENCODING_8859_8:
  981. case XML_CHAR_ENCODING_8859_9:
  982. /*
  983. * We used to keep the internal content in the
  984. * document encoding however this turns being unmaintainable
  985. * So xmlGetCharEncodingHandler() will return non-null
  986. * values for this now.
  987. */
  988. if ((ctxt->inputNr == 1) &&
  989. (ctxt->encoding == NULL) &&
  990. (ctxt->input != NULL) &&
  991. (ctxt->input->encoding != NULL)) {
  992. ctxt->encoding = xmlStrdup(ctxt->input->encoding);
  993. }
  994. ctxt->charset = enc;
  995. return(0);
  996. case XML_CHAR_ENCODING_2022_JP:
  997. __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  998. "encoding not supported %s\n",
  999. BAD_CAST "ISO-2022-JP", NULL);
  1000. break;
  1001. case XML_CHAR_ENCODING_SHIFT_JIS:
  1002. __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  1003. "encoding not supported %s\n",
  1004. BAD_CAST "Shift_JIS", NULL);
  1005. break;
  1006. case XML_CHAR_ENCODING_EUC_JP:
  1007. __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  1008. "encoding not supported %s\n",
  1009. BAD_CAST "EUC-JP", NULL);
  1010. break;
  1011. default:
  1012. break;
  1013. }
  1014. }
  1015. /*
  1016. * TODO: We could recover from errors in external entities if we
  1017. * didn't stop the parser. But most callers of this function don't
  1018. * check the return value.
  1019. */
  1020. if (handler == NULL) {
  1021. xmlStopParser(ctxt);
  1022. return(-1);
  1023. }
  1024. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  1025. ret = xmlSwitchToEncodingInt(ctxt, handler, len);
  1026. if ((ret < 0) || (ctxt->errNo == XML_I18N_CONV_FAILED)) {
  1027. /*
  1028. * on encoding conversion errors, stop the parser
  1029. */
  1030. xmlStopParser(ctxt);
  1031. ctxt->errNo = XML_I18N_CONV_FAILED;
  1032. }
  1033. return(ret);
  1034. }
  1035. /**
  1036. * xmlSwitchInputEncoding:
  1037. * @ctxt: the parser context
  1038. * @input: the input stream
  1039. * @handler: the encoding handler
  1040. * @len: the number of bytes to convert for the first line or -1
  1041. *
  1042. * change the input functions when discovering the character encoding
  1043. * of a given entity.
  1044. *
  1045. * Returns 0 in case of success, -1 otherwise
  1046. */
  1047. static int
  1048. xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
  1049. xmlCharEncodingHandlerPtr handler, int len)
  1050. {
  1051. int nbchars;
  1052. if (handler == NULL)
  1053. return (-1);
  1054. if (input == NULL)
  1055. return (-1);
  1056. if (input->buf != NULL) {
  1057. if (input->buf->encoder != NULL) {
  1058. /*
  1059. * Check in case the auto encoding detection triggered
  1060. * in already.
  1061. */
  1062. if (input->buf->encoder == handler)
  1063. return (0);
  1064. /*
  1065. * "UTF-16" can be used for both LE and BE
  1066. if ((!xmlStrncmp(BAD_CAST input->buf->encoder->name,
  1067. BAD_CAST "UTF-16", 6)) &&
  1068. (!xmlStrncmp(BAD_CAST handler->name,
  1069. BAD_CAST "UTF-16", 6))) {
  1070. return(0);
  1071. }
  1072. */
  1073. /*
  1074. * Note: this is a bit dangerous, but that's what it
  1075. * takes to use nearly compatible signature for different
  1076. * encodings.
  1077. *
  1078. * FIXME: Encoders might buffer partial byte sequences, so
  1079. * this probably can't work. We should return an error and
  1080. * make sure that callers never try to switch the encoding
  1081. * twice.
  1082. */
  1083. xmlCharEncCloseFunc(input->buf->encoder);
  1084. input->buf->encoder = handler;
  1085. return (0);
  1086. }
  1087. input->buf->encoder = handler;
  1088. /*
  1089. * Is there already some content down the pipe to convert ?
  1090. */
  1091. if (xmlBufIsEmpty(input->buf->buffer) == 0) {
  1092. int processed;
  1093. unsigned int use;
  1094. /*
  1095. * Specific handling of the Byte Order Mark for
  1096. * UTF-16
  1097. */
  1098. if ((handler->name != NULL) &&
  1099. (!strcmp(handler->name, "UTF-16LE") ||
  1100. !strcmp(handler->name, "UTF-16")) &&
  1101. (input->cur[0] == 0xFF) && (input->cur[1] == 0xFE)) {
  1102. input->cur += 2;
  1103. }
  1104. if ((handler->name != NULL) &&
  1105. (!strcmp(handler->name, "UTF-16BE")) &&
  1106. (input->cur[0] == 0xFE) && (input->cur[1] == 0xFF)) {
  1107. input->cur += 2;
  1108. }
  1109. /*
  1110. * Errata on XML-1.0 June 20 2001
  1111. * Specific handling of the Byte Order Mark for
  1112. * UTF-8
  1113. */
  1114. if ((handler->name != NULL) &&
  1115. (!strcmp(handler->name, "UTF-8")) &&
  1116. (input->cur[0] == 0xEF) &&
  1117. (input->cur[1] == 0xBB) && (input->cur[2] == 0xBF)) {
  1118. input->cur += 3;
  1119. }
  1120. /*
  1121. * Shrink the current input buffer.
  1122. * Move it as the raw buffer and create a new input buffer
  1123. */
  1124. processed = input->cur - input->base;
  1125. xmlBufShrink(input->buf->buffer, processed);
  1126. input->buf->raw = input->buf->buffer;
  1127. input->buf->buffer = xmlBufCreate();
  1128. input->buf->rawconsumed = processed;
  1129. use = xmlBufUse(input->buf->raw);
  1130. if (ctxt->html) {
  1131. /*
  1132. * convert as much as possible of the buffer
  1133. */
  1134. nbchars = xmlCharEncInput(input->buf, 1);
  1135. } else {
  1136. /*
  1137. * convert just enough to get
  1138. * '<?xml version="1.0" encoding="xxx"?>'
  1139. * parsed with the autodetected encoding
  1140. * into the parser reading buffer.
  1141. */
  1142. nbchars = xmlCharEncFirstLineInput(input->buf, len);
  1143. }
  1144. xmlBufResetInput(input->buf->buffer, input);
  1145. if (nbchars < 0) {
  1146. xmlErrInternal(ctxt,
  1147. "switching encoding: encoder error\n",
  1148. NULL);
  1149. return (-1);
  1150. }
  1151. input->buf->rawconsumed += use - xmlBufUse(input->buf->raw);
  1152. }
  1153. return (0);
  1154. } else if (input->length == 0) {
  1155. /*
  1156. * When parsing a static memory array one must know the
  1157. * size to be able to convert the buffer.
  1158. */
  1159. xmlErrInternal(ctxt, "switching encoding : no input\n", NULL);
  1160. /*
  1161. * Callers assume that the input buffer takes ownership of the
  1162. * encoding handler. xmlCharEncCloseFunc frees unregistered
  1163. * handlers and avoids a memory leak.
  1164. */
  1165. xmlCharEncCloseFunc(handler);
  1166. return (-1);
  1167. }
  1168. /*
  1169. * We should actually raise an error here, see issue #34.
  1170. */
  1171. xmlCharEncCloseFunc(handler);
  1172. return (0);
  1173. }
  1174. /**
  1175. * xmlSwitchInputEncoding:
  1176. * @ctxt: the parser context
  1177. * @input: the input stream
  1178. * @handler: the encoding handler
  1179. *
  1180. * change the input functions when discovering the character encoding
  1181. * of a given entity.
  1182. *
  1183. * Returns 0 in case of success, -1 otherwise
  1184. */
  1185. int
  1186. xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
  1187. xmlCharEncodingHandlerPtr handler) {
  1188. return(xmlSwitchInputEncodingInt(ctxt, input, handler, -1));
  1189. }
  1190. /**
  1191. * xmlSwitchToEncodingInt:
  1192. * @ctxt: the parser context
  1193. * @handler: the encoding handler
  1194. * @len: the length to convert or -1
  1195. *
  1196. * change the input functions when discovering the character encoding
  1197. * of a given entity, and convert only @len bytes of the output, this
  1198. * is needed on auto detect to allows any declared encoding later to
  1199. * convert the actual content after the xmlDecl
  1200. *
  1201. * Returns 0 in case of success, -1 otherwise
  1202. */
  1203. static int
  1204. xmlSwitchToEncodingInt(xmlParserCtxtPtr ctxt,
  1205. xmlCharEncodingHandlerPtr handler, int len) {
  1206. int ret = 0;
  1207. if (handler != NULL) {
  1208. if (ctxt->input != NULL) {
  1209. ret = xmlSwitchInputEncodingInt(ctxt, ctxt->input, handler, len);
  1210. } else {
  1211. xmlErrInternal(ctxt, "xmlSwitchToEncoding : no input\n",
  1212. NULL);
  1213. return(-1);
  1214. }
  1215. /*
  1216. * The parsing is now done in UTF8 natively
  1217. */
  1218. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  1219. } else
  1220. return(-1);
  1221. return(ret);
  1222. }
  1223. /**
  1224. * xmlSwitchToEncoding:
  1225. * @ctxt: the parser context
  1226. * @handler: the encoding handler
  1227. *
  1228. * change the input functions when discovering the character encoding
  1229. * of a given entity.
  1230. *
  1231. * Returns 0 in case of success, -1 otherwise
  1232. */
  1233. int
  1234. xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler)
  1235. {
  1236. return (xmlSwitchToEncodingInt(ctxt, handler, -1));
  1237. }
  1238. /************************************************************************
  1239. * *
  1240. * Commodity functions to handle entities processing *
  1241. * *
  1242. ************************************************************************/
  1243. /**
  1244. * xmlFreeInputStream:
  1245. * @input: an xmlParserInputPtr
  1246. *
  1247. * Free up an input stream.
  1248. */
  1249. void
  1250. xmlFreeInputStream(xmlParserInputPtr input) {
  1251. if (input == NULL) return;
  1252. if (input->filename != NULL) xmlFree((char *) input->filename);
  1253. if (input->directory != NULL) xmlFree((char *) input->directory);
  1254. if (input->encoding != NULL) xmlFree((char *) input->encoding);
  1255. if (input->version != NULL) xmlFree((char *) input->version);
  1256. if ((input->free != NULL) && (input->base != NULL))
  1257. input->free((xmlChar *) input->base);
  1258. if (input->buf != NULL)
  1259. xmlFreeParserInputBuffer(input->buf);
  1260. xmlFree(input);
  1261. }
  1262. /**
  1263. * xmlNewInputStream:
  1264. * @ctxt: an XML parser context
  1265. *
  1266. * Create a new input stream structure.
  1267. *
  1268. * Returns the new input stream or NULL
  1269. */
  1270. xmlParserInputPtr
  1271. xmlNewInputStream(xmlParserCtxtPtr ctxt) {
  1272. xmlParserInputPtr input;
  1273. input = (xmlParserInputPtr) xmlMalloc(sizeof(xmlParserInput));
  1274. if (input == NULL) {
  1275. xmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
  1276. return(NULL);
  1277. }
  1278. memset(input, 0, sizeof(xmlParserInput));
  1279. input->line = 1;
  1280. input->col = 1;
  1281. input->standalone = -1;
  1282. /*
  1283. * If the context is NULL the id cannot be initialized, but that
  1284. * should not happen while parsing which is the situation where
  1285. * the id is actually needed.
  1286. */
  1287. if (ctxt != NULL)
  1288. input->id = ctxt->input_id++;
  1289. return(input);
  1290. }
  1291. /**
  1292. * xmlNewIOInputStream:
  1293. * @ctxt: an XML parser context
  1294. * @input: an I/O Input
  1295. * @enc: the charset encoding if known
  1296. *
  1297. * Create a new input stream structure encapsulating the @input into
  1298. * a stream suitable for the parser.
  1299. *
  1300. * Returns the new input stream or NULL
  1301. */
  1302. xmlParserInputPtr
  1303. xmlNewIOInputStream(xmlParserCtxtPtr ctxt, xmlParserInputBufferPtr input,
  1304. xmlCharEncoding enc) {
  1305. xmlParserInputPtr inputStream;
  1306. if (input == NULL) return(NULL);
  1307. if (xmlParserDebugEntities)
  1308. xmlGenericError(xmlGenericErrorContext, "new input from I/O\n");
  1309. inputStream = xmlNewInputStream(ctxt);
  1310. if (inputStream == NULL) {
  1311. return(NULL);
  1312. }
  1313. inputStream->filename = NULL;
  1314. inputStream->buf = input;
  1315. xmlBufResetInput(inputStream->buf->buffer, inputStream);
  1316. if (enc != XML_CHAR_ENCODING_NONE) {
  1317. xmlSwitchEncoding(ctxt, enc);
  1318. }
  1319. return(inputStream);
  1320. }
  1321. /**
  1322. * xmlNewEntityInputStream:
  1323. * @ctxt: an XML parser context
  1324. * @entity: an Entity pointer
  1325. *
  1326. * Create a new input stream based on an xmlEntityPtr
  1327. *
  1328. * Returns the new input stream or NULL
  1329. */
  1330. xmlParserInputPtr
  1331. xmlNewEntityInputStream(xmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
  1332. xmlParserInputPtr input;
  1333. if (entity == NULL) {
  1334. xmlErrInternal(ctxt, "xmlNewEntityInputStream entity = NULL\n",
  1335. NULL);
  1336. return(NULL);
  1337. }
  1338. if (xmlParserDebugEntities)
  1339. xmlGenericError(xmlGenericErrorContext,
  1340. "new input from entity: %s\n", entity->name);
  1341. if (entity->content == NULL) {
  1342. switch (entity->etype) {
  1343. case XML_EXTERNAL_GENERAL_UNPARSED_ENTITY:
  1344. xmlErrInternal(ctxt, "Cannot parse entity %s\n",
  1345. entity->name);
  1346. break;
  1347. case XML_EXTERNAL_GENERAL_PARSED_ENTITY:
  1348. case XML_EXTERNAL_PARAMETER_ENTITY:
  1349. return(xmlLoadExternalEntity((char *) entity->URI,
  1350. (char *) entity->ExternalID, ctxt));
  1351. case XML_INTERNAL_GENERAL_ENTITY:
  1352. xmlErrInternal(ctxt,
  1353. "Internal entity %s without content !\n",
  1354. entity->name);
  1355. break;
  1356. case XML_INTERNAL_PARAMETER_ENTITY:
  1357. xmlErrInternal(ctxt,
  1358. "Internal parameter entity %s without content !\n",
  1359. entity->name);
  1360. break;
  1361. case XML_INTERNAL_PREDEFINED_ENTITY:
  1362. xmlErrInternal(ctxt,
  1363. "Predefined entity %s without content !\n",
  1364. entity->name);
  1365. break;
  1366. }
  1367. return(NULL);
  1368. }
  1369. input = xmlNewInputStream(ctxt);
  1370. if (input == NULL) {
  1371. return(NULL);
  1372. }
  1373. if (entity->URI != NULL)
  1374. input->filename = (char *) xmlStrdup((xmlChar *) entity->URI);
  1375. input->base = entity->content;
  1376. if (entity->length == 0)
  1377. entity->length = xmlStrlen(entity->content);
  1378. input->cur = entity->content;
  1379. input->length = entity->length;
  1380. input->end = &entity->content[input->length];
  1381. return(input);
  1382. }
  1383. /**
  1384. * xmlNewStringInputStream:
  1385. * @ctxt: an XML parser context
  1386. * @buffer: an memory buffer
  1387. *
  1388. * Create a new input stream based on a memory buffer.
  1389. * Returns the new input stream
  1390. */
  1391. xmlParserInputPtr
  1392. xmlNewStringInputStream(xmlParserCtxtPtr ctxt, const xmlChar *buffer) {
  1393. xmlParserInputPtr input;
  1394. if (buffer == NULL) {
  1395. xmlErrInternal(ctxt, "xmlNewStringInputStream string = NULL\n",
  1396. NULL);
  1397. return(NULL);
  1398. }
  1399. if (xmlParserDebugEntities)
  1400. xmlGenericError(xmlGenericErrorContext,
  1401. "new fixed input: %.30s\n", buffer);
  1402. input = xmlNewInputStream(ctxt);
  1403. if (input == NULL) {
  1404. xmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
  1405. return(NULL);
  1406. }
  1407. input->base = buffer;
  1408. input->cur = buffer;
  1409. input->length = xmlStrlen(buffer);
  1410. input->end = &buffer[input->length];
  1411. return(input);
  1412. }
  1413. /**
  1414. * xmlNewInputFromFile:
  1415. * @ctxt: an XML parser context
  1416. * @filename: the filename to use as entity
  1417. *
  1418. * Create a new input stream based on a file or an URL.
  1419. *
  1420. * Returns the new input stream or NULL in case of error
  1421. */
  1422. xmlParserInputPtr
  1423. xmlNewInputFromFile(xmlParserCtxtPtr ctxt, const char *filename) {
  1424. xmlParserInputBufferPtr buf;
  1425. xmlParserInputPtr inputStream;
  1426. char *directory = NULL;
  1427. xmlChar *URI = NULL;
  1428. if (xmlParserDebugEntities)
  1429. xmlGenericError(xmlGenericErrorContext,
  1430. "new input from file: %s\n", filename);
  1431. if (ctxt == NULL) return(NULL);
  1432. buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
  1433. if (buf == NULL) {
  1434. if (filename == NULL)
  1435. __xmlLoaderErr(ctxt,
  1436. "failed to load external entity: NULL filename \n",
  1437. NULL);
  1438. else
  1439. __xmlLoaderErr(ctxt, "failed to load external entity \"%s\"\n",
  1440. (const char *) filename);
  1441. return(NULL);
  1442. }
  1443. inputStream = xmlNewInputStream(ctxt);
  1444. if (inputStream == NULL) {
  1445. xmlFreeParserInputBuffer(buf);
  1446. return(NULL);
  1447. }
  1448. inputStream->buf = buf;
  1449. inputStream = xmlCheckHTTPInput(ctxt, inputStream);
  1450. if (inputStream == NULL)
  1451. return(NULL);
  1452. if (inputStream->filename == NULL)
  1453. URI = xmlStrdup((xmlChar *) filename);
  1454. else
  1455. URI = xmlStrdup((xmlChar *) inputStream->filename);
  1456. directory = xmlParserGetDirectory((const char *) URI);
  1457. if (inputStream->filename != NULL) xmlFree((char *)inputStream->filename);
  1458. inputStream->filename = (char *) xmlCanonicPath((const xmlChar *) URI);
  1459. if (URI != NULL) xmlFree((char *) URI);
  1460. inputStream->directory = directory;
  1461. xmlBufResetInput(inputStream->buf->buffer, inputStream);
  1462. if ((ctxt->directory == NULL) && (directory != NULL))
  1463. ctxt->directory = (char *) xmlStrdup((const xmlChar *) directory);
  1464. return(inputStream);
  1465. }
  1466. /************************************************************************
  1467. * *
  1468. * Commodity functions to handle parser contexts *
  1469. * *
  1470. ************************************************************************/
  1471. /**
  1472. * xmlInitParserCtxt:
  1473. * @ctxt: an XML parser context
  1474. *
  1475. * Initialize a parser context
  1476. *
  1477. * Returns 0 in case of success and -1 in case of error
  1478. */
  1479. int
  1480. xmlInitParserCtxt(xmlParserCtxtPtr ctxt)
  1481. {
  1482. xmlParserInputPtr input;
  1483. if(ctxt==NULL) {
  1484. xmlErrInternal(NULL, "Got NULL parser context\n", NULL);
  1485. return(-1);
  1486. }
  1487. xmlDefaultSAXHandlerInit();
  1488. if (ctxt->dict == NULL)
  1489. ctxt->dict = xmlDictCreate();
  1490. if (ctxt->dict == NULL) {
  1491. xmlErrMemory(NULL, "cannot initialize parser context\n");
  1492. return(-1);
  1493. }
  1494. xmlDictSetLimit(ctxt->dict, XML_MAX_DICTIONARY_LIMIT);
  1495. if (ctxt->sax == NULL)
  1496. ctxt->sax = (xmlSAXHandler *) xmlMalloc(sizeof(xmlSAXHandler));
  1497. if (ctxt->sax == NULL) {
  1498. xmlErrMemory(NULL, "cannot initialize parser context\n");
  1499. return(-1);
  1500. }
  1501. else
  1502. xmlSAXVersion(ctxt->sax, 2);
  1503. ctxt->maxatts = 0;
  1504. ctxt->atts = NULL;
  1505. /* Allocate the Input stack */
  1506. if (ctxt->inputTab == NULL) {
  1507. ctxt->inputTab = (xmlParserInputPtr *)
  1508. xmlMalloc(5 * sizeof(xmlParserInputPtr));
  1509. ctxt->inputMax = 5;
  1510. }
  1511. if (ctxt->inputTab == NULL) {
  1512. xmlErrMemory(NULL, "cannot initialize parser context\n");
  1513. ctxt->inputNr = 0;
  1514. ctxt->inputMax = 0;
  1515. ctxt->input = NULL;
  1516. return(-1);
  1517. }
  1518. while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
  1519. xmlFreeInputStream(input);
  1520. }
  1521. ctxt->inputNr = 0;
  1522. ctxt->input = NULL;
  1523. ctxt->version = NULL;
  1524. ctxt->encoding = NULL;
  1525. ctxt->standalone = -1;
  1526. ctxt->hasExternalSubset = 0;
  1527. ctxt->hasPErefs = 0;
  1528. ctxt->html = 0;
  1529. ctxt->external = 0;
  1530. ctxt->instate = XML_PARSER_START;
  1531. ctxt->token = 0;
  1532. ctxt->directory = NULL;
  1533. /* Allocate the Node stack */
  1534. if (ctxt->nodeTab == NULL) {
  1535. ctxt->nodeTab = (xmlNodePtr *) xmlMalloc(10 * sizeof(xmlNodePtr));
  1536. ctxt->nodeMax = 10;
  1537. }
  1538. if (ctxt->nodeTab == NULL) {
  1539. xmlErrMemory(NULL, "cannot initialize parser context\n");
  1540. ctxt->nodeNr = 0;
  1541. ctxt->nodeMax = 0;
  1542. ctxt->node = NULL;
  1543. ctxt->inputNr = 0;
  1544. ctxt->inputMax = 0;
  1545. ctxt->input = NULL;
  1546. return(-1);
  1547. }
  1548. ctxt->nodeNr = 0;
  1549. ctxt->node = NULL;
  1550. /* Allocate the Name stack */
  1551. if (ctxt->nameTab == NULL) {
  1552. ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
  1553. ctxt->nameMax = 10;
  1554. }
  1555. if (ctxt->nameTab == NULL) {
  1556. xmlErrMemory(NULL, "cannot initialize parser context\n");
  1557. ctxt->nodeNr = 0;
  1558. ctxt->nodeMax = 0;
  1559. ctxt->node = NULL;
  1560. ctxt->inputNr = 0;
  1561. ctxt->inputMax = 0;
  1562. ctxt->input = NULL;
  1563. ctxt->nameNr = 0;
  1564. ctxt->nameMax = 0;
  1565. ctxt->name = NULL;
  1566. return(-1);
  1567. }
  1568. ctxt->nameNr = 0;
  1569. ctxt->name = NULL;
  1570. /* Allocate the space stack */
  1571. if (ctxt->spaceTab == NULL) {
  1572. ctxt->spaceTab = (int *) xmlMalloc(10 * sizeof(int));
  1573. ctxt->spaceMax = 10;
  1574. }
  1575. if (ctxt->spaceTab == NULL) {
  1576. xmlErrMemory(NULL, "cannot initialize parser context\n");
  1577. ctxt->nodeNr = 0;
  1578. ctxt->nodeMax = 0;
  1579. ctxt->node = NULL;
  1580. ctxt->inputNr = 0;
  1581. ctxt->inputMax = 0;
  1582. ctxt->input = NULL;
  1583. ctxt->nameNr = 0;
  1584. ctxt->nameMax = 0;
  1585. ctxt->name = NULL;
  1586. ctxt->spaceNr = 0;
  1587. ctxt->spaceMax = 0;
  1588. ctxt->space = NULL;
  1589. return(-1);
  1590. }
  1591. ctxt->spaceNr = 1;
  1592. ctxt->spaceMax = 10;
  1593. ctxt->spaceTab[0] = -1;
  1594. ctxt->space = &ctxt->spaceTab[0];
  1595. ctxt->userData = ctxt;
  1596. ctxt->myDoc = NULL;
  1597. ctxt->wellFormed = 1;
  1598. ctxt->nsWellFormed = 1;
  1599. ctxt->valid = 1;
  1600. ctxt->loadsubset = xmlLoadExtDtdDefaultValue;
  1601. if (ctxt->loadsubset) {
  1602. ctxt->options |= XML_PARSE_DTDLOAD;
  1603. }
  1604. ctxt->validate = xmlDoValidityCheckingDefaultValue;
  1605. ctxt->pedantic = xmlPedanticParserDefaultValue;
  1606. if (ctxt->pedantic) {
  1607. ctxt->options |= XML_PARSE_PEDANTIC;
  1608. }
  1609. ctxt->linenumbers = xmlLineNumbersDefaultValue;
  1610. ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
  1611. if (ctxt->keepBlanks == 0) {
  1612. ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
  1613. ctxt->options |= XML_PARSE_NOBLANKS;
  1614. }
  1615. ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
  1616. ctxt->vctxt.userData = ctxt;
  1617. ctxt->vctxt.error = xmlParserValidityError;
  1618. ctxt->vctxt.warning = xmlParserValidityWarning;
  1619. if (ctxt->validate) {
  1620. if (xmlGetWarningsDefaultValue == 0)
  1621. ctxt->vctxt.warning = NULL;
  1622. else
  1623. ctxt->vctxt.warning = xmlParserValidityWarning;
  1624. ctxt->vctxt.nodeMax = 0;
  1625. ctxt->options |= XML_PARSE_DTDVALID;
  1626. }
  1627. ctxt->replaceEntities = xmlSubstituteEntitiesDefaultValue;
  1628. if (ctxt->replaceEntities) {
  1629. ctxt->options |= XML_PARSE_NOENT;
  1630. }
  1631. ctxt->record_info = 0;
  1632. ctxt->checkIndex = 0;
  1633. ctxt->inSubset = 0;
  1634. ctxt->errNo = XML_ERR_OK;
  1635. ctxt->depth = 0;
  1636. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  1637. ctxt->catalogs = NULL;
  1638. ctxt->nbentities = 0;
  1639. ctxt->sizeentities = 0;
  1640. ctxt->sizeentcopy = 0;
  1641. ctxt->input_id = 1;
  1642. xmlInitNodeInfoSeq(&ctxt->node_seq);
  1643. return(0);
  1644. }
  1645. /**
  1646. * xmlFreeParserCtxt:
  1647. * @ctxt: an XML parser context
  1648. *
  1649. * Free all the memory used by a parser context. However the parsed
  1650. * document in ctxt->myDoc is not freed.
  1651. */
  1652. void
  1653. xmlFreeParserCtxt(xmlParserCtxtPtr ctxt)
  1654. {
  1655. xmlParserInputPtr input;
  1656. if (ctxt == NULL) return;
  1657. while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
  1658. xmlFreeInputStream(input);
  1659. }
  1660. if (ctxt->spaceTab != NULL) xmlFree(ctxt->spaceTab);
  1661. if (ctxt->nameTab != NULL) xmlFree((xmlChar * *)ctxt->nameTab);
  1662. if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab);
  1663. if (ctxt->nodeInfoTab != NULL) xmlFree(ctxt->nodeInfoTab);
  1664. if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab);
  1665. if (ctxt->version != NULL) xmlFree((char *) ctxt->version);
  1666. if (ctxt->encoding != NULL) xmlFree((char *) ctxt->encoding);
  1667. if (ctxt->extSubURI != NULL) xmlFree((char *) ctxt->extSubURI);
  1668. if (ctxt->extSubSystem != NULL) xmlFree((char *) ctxt->extSubSystem);
  1669. #ifdef LIBXML_SAX1_ENABLED
  1670. if ((ctxt->sax != NULL) &&
  1671. (ctxt->sax != (xmlSAXHandlerPtr) &xmlDefaultSAXHandler))
  1672. #else
  1673. if (ctxt->sax != NULL)
  1674. #endif /* LIBXML_SAX1_ENABLED */
  1675. xmlFree(ctxt->sax);
  1676. if (ctxt->directory != NULL) xmlFree((char *) ctxt->directory);
  1677. if (ctxt->vctxt.nodeTab != NULL) xmlFree(ctxt->vctxt.nodeTab);
  1678. if (ctxt->atts != NULL) xmlFree((xmlChar * *)ctxt->atts);
  1679. if (ctxt->dict != NULL) xmlDictFree(ctxt->dict);
  1680. if (ctxt->nsTab != NULL) xmlFree((char *) ctxt->nsTab);
  1681. if (ctxt->pushTab != NULL) xmlFree(ctxt->pushTab);
  1682. if (ctxt->attallocs != NULL) xmlFree(ctxt->attallocs);
  1683. if (ctxt->attsDefault != NULL)
  1684. xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
  1685. if (ctxt->attsSpecial != NULL)
  1686. xmlHashFree(ctxt->attsSpecial, NULL);
  1687. if (ctxt->freeElems != NULL) {
  1688. xmlNodePtr cur, next;
  1689. cur = ctxt->freeElems;
  1690. while (cur != NULL) {
  1691. next = cur->next;
  1692. xmlFree(cur);
  1693. cur = next;
  1694. }
  1695. }
  1696. if (ctxt->freeAttrs != NULL) {
  1697. xmlAttrPtr cur, next;
  1698. cur = ctxt->freeAttrs;
  1699. while (cur != NULL) {
  1700. next = cur->next;
  1701. xmlFree(cur);
  1702. cur = next;
  1703. }
  1704. }
  1705. /*
  1706. * cleanup the error strings
  1707. */
  1708. if (ctxt->lastError.message != NULL)
  1709. xmlFree(ctxt->lastError.message);
  1710. if (ctxt->lastError.file != NULL)
  1711. xmlFree(ctxt->lastError.file);
  1712. if (ctxt->lastError.str1 != NULL)
  1713. xmlFree(ctxt->lastError.str1);
  1714. if (ctxt->lastError.str2 != NULL)
  1715. xmlFree(ctxt->lastError.str2);
  1716. if (ctxt->lastError.str3 != NULL)
  1717. xmlFree(ctxt->lastError.str3);
  1718. #ifdef LIBXML_CATALOG_ENABLED
  1719. if (ctxt->catalogs != NULL)
  1720. xmlCatalogFreeLocal(ctxt->catalogs);
  1721. #endif
  1722. xmlFree(ctxt);
  1723. }
  1724. /**
  1725. * xmlNewParserCtxt:
  1726. *
  1727. * Allocate and initialize a new parser context.
  1728. *
  1729. * Returns the xmlParserCtxtPtr or NULL
  1730. */
  1731. xmlParserCtxtPtr
  1732. xmlNewParserCtxt(void)
  1733. {
  1734. xmlParserCtxtPtr ctxt;
  1735. ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
  1736. if (ctxt == NULL) {
  1737. xmlErrMemory(NULL, "cannot allocate parser context\n");
  1738. return(NULL);
  1739. }
  1740. memset(ctxt, 0, sizeof(xmlParserCtxt));
  1741. if (xmlInitParserCtxt(ctxt) < 0) {
  1742. xmlFreeParserCtxt(ctxt);
  1743. return(NULL);
  1744. }
  1745. return(ctxt);
  1746. }
  1747. /************************************************************************
  1748. * *
  1749. * Handling of node information *
  1750. * *
  1751. ************************************************************************/
  1752. /**
  1753. * xmlClearParserCtxt:
  1754. * @ctxt: an XML parser context
  1755. *
  1756. * Clear (release owned resources) and reinitialize a parser context
  1757. */
  1758. void
  1759. xmlClearParserCtxt(xmlParserCtxtPtr ctxt)
  1760. {
  1761. if (ctxt==NULL)
  1762. return;
  1763. xmlClearNodeInfoSeq(&ctxt->node_seq);
  1764. xmlCtxtReset(ctxt);
  1765. }
  1766. /**
  1767. * xmlParserFindNodeInfo:
  1768. * @ctx: an XML parser context
  1769. * @node: an XML node within the tree
  1770. *
  1771. * Find the parser node info struct for a given node
  1772. *
  1773. * Returns an xmlParserNodeInfo block pointer or NULL
  1774. */
  1775. const xmlParserNodeInfo *
  1776. xmlParserFindNodeInfo(const xmlParserCtxtPtr ctx, const xmlNodePtr node)
  1777. {
  1778. unsigned long pos;
  1779. if ((ctx == NULL) || (node == NULL))
  1780. return (NULL);
  1781. /* Find position where node should be at */
  1782. pos = xmlParserFindNodeInfoIndex(&ctx->node_seq, node);
  1783. if (pos < ctx->node_seq.length
  1784. && ctx->node_seq.buffer[pos].node == node)
  1785. return &ctx->node_seq.buffer[pos];
  1786. else
  1787. return NULL;
  1788. }
  1789. /**
  1790. * xmlInitNodeInfoSeq:
  1791. * @seq: a node info sequence pointer
  1792. *
  1793. * -- Initialize (set to initial state) node info sequence
  1794. */
  1795. void
  1796. xmlInitNodeInfoSeq(xmlParserNodeInfoSeqPtr seq)
  1797. {
  1798. if (seq == NULL)
  1799. return;
  1800. seq->length = 0;
  1801. seq->maximum = 0;
  1802. seq->buffer = NULL;
  1803. }
  1804. /**
  1805. * xmlClearNodeInfoSeq:
  1806. * @seq: a node info sequence pointer
  1807. *
  1808. * -- Clear (release memory and reinitialize) node
  1809. * info sequence
  1810. */
  1811. void
  1812. xmlClearNodeInfoSeq(xmlParserNodeInfoSeqPtr seq)
  1813. {
  1814. if (seq == NULL)
  1815. return;
  1816. if (seq->buffer != NULL)
  1817. xmlFree(seq->buffer);
  1818. xmlInitNodeInfoSeq(seq);
  1819. }
  1820. /**
  1821. * xmlParserFindNodeInfoIndex:
  1822. * @seq: a node info sequence pointer
  1823. * @node: an XML node pointer
  1824. *
  1825. *
  1826. * xmlParserFindNodeInfoIndex : Find the index that the info record for
  1827. * the given node is or should be at in a sorted sequence
  1828. *
  1829. * Returns a long indicating the position of the record
  1830. */
  1831. unsigned long
  1832. xmlParserFindNodeInfoIndex(const xmlParserNodeInfoSeqPtr seq,
  1833. const xmlNodePtr node)
  1834. {
  1835. unsigned long upper, lower, middle;
  1836. int found = 0;
  1837. if ((seq == NULL) || (node == NULL))
  1838. return ((unsigned long) -1);
  1839. /* Do a binary search for the key */
  1840. lower = 1;
  1841. upper = seq->length;
  1842. middle = 0;
  1843. while (lower <= upper && !found) {
  1844. middle = lower + (upper - lower) / 2;
  1845. if (node == seq->buffer[middle - 1].node)
  1846. found = 1;
  1847. else if (node < seq->buffer[middle - 1].node)
  1848. upper = middle - 1;
  1849. else
  1850. lower = middle + 1;
  1851. }
  1852. /* Return position */
  1853. if (middle == 0 || seq->buffer[middle - 1].node < node)
  1854. return middle;
  1855. else
  1856. return middle - 1;
  1857. }
  1858. /**
  1859. * xmlParserAddNodeInfo:
  1860. * @ctxt: an XML parser context
  1861. * @info: a node info sequence pointer
  1862. *
  1863. * Insert node info record into the sorted sequence
  1864. */
  1865. void
  1866. xmlParserAddNodeInfo(xmlParserCtxtPtr ctxt,
  1867. const xmlParserNodeInfoPtr info)
  1868. {
  1869. unsigned long pos;
  1870. if ((ctxt == NULL) || (info == NULL)) return;
  1871. /* Find pos and check to see if node is already in the sequence */
  1872. pos = xmlParserFindNodeInfoIndex(&ctxt->node_seq, (xmlNodePtr)
  1873. info->node);
  1874. if ((pos < ctxt->node_seq.length) &&
  1875. (ctxt->node_seq.buffer != NULL) &&
  1876. (ctxt->node_seq.buffer[pos].node == info->node)) {
  1877. ctxt->node_seq.buffer[pos] = *info;
  1878. }
  1879. /* Otherwise, we need to add new node to buffer */
  1880. else {
  1881. if ((ctxt->node_seq.length + 1 > ctxt->node_seq.maximum) ||
  1882. (ctxt->node_seq.buffer == NULL)) {
  1883. xmlParserNodeInfo *tmp_buffer;
  1884. unsigned int byte_size;
  1885. if (ctxt->node_seq.maximum == 0)
  1886. ctxt->node_seq.maximum = 2;
  1887. byte_size = (sizeof(*ctxt->node_seq.buffer) *
  1888. (2 * ctxt->node_seq.maximum));
  1889. if (ctxt->node_seq.buffer == NULL)
  1890. tmp_buffer = (xmlParserNodeInfo *) xmlMalloc(byte_size);
  1891. else
  1892. tmp_buffer =
  1893. (xmlParserNodeInfo *) xmlRealloc(ctxt->node_seq.buffer,
  1894. byte_size);
  1895. if (tmp_buffer == NULL) {
  1896. xmlErrMemory(ctxt, "failed to allocate buffer\n");
  1897. return;
  1898. }
  1899. ctxt->node_seq.buffer = tmp_buffer;
  1900. ctxt->node_seq.maximum *= 2;
  1901. }
  1902. /* If position is not at end, move elements out of the way */
  1903. if (pos != ctxt->node_seq.length) {
  1904. unsigned long i;
  1905. for (i = ctxt->node_seq.length; i > pos; i--)
  1906. ctxt->node_seq.buffer[i] = ctxt->node_seq.buffer[i - 1];
  1907. }
  1908. /* Copy element and increase length */
  1909. ctxt->node_seq.buffer[pos] = *info;
  1910. ctxt->node_seq.length++;
  1911. }
  1912. }
  1913. /************************************************************************
  1914. * *
  1915. * Defaults settings *
  1916. * *
  1917. ************************************************************************/
  1918. /**
  1919. * xmlPedanticParserDefault:
  1920. * @val: int 0 or 1
  1921. *
  1922. * Set and return the previous value for enabling pedantic warnings.
  1923. *
  1924. * Returns the last value for 0 for no substitution, 1 for substitution.
  1925. */
  1926. int
  1927. xmlPedanticParserDefault(int val) {
  1928. int old = xmlPedanticParserDefaultValue;
  1929. xmlPedanticParserDefaultValue = val;
  1930. return(old);
  1931. }
  1932. /**
  1933. * xmlLineNumbersDefault:
  1934. * @val: int 0 or 1
  1935. *
  1936. * Set and return the previous value for enabling line numbers in elements
  1937. * contents. This may break on old application and is turned off by default.
  1938. *
  1939. * Returns the last value for 0 for no substitution, 1 for substitution.
  1940. */
  1941. int
  1942. xmlLineNumbersDefault(int val) {
  1943. int old = xmlLineNumbersDefaultValue;
  1944. xmlLineNumbersDefaultValue = val;
  1945. return(old);
  1946. }
  1947. /**
  1948. * xmlSubstituteEntitiesDefault:
  1949. * @val: int 0 or 1
  1950. *
  1951. * Set and return the previous value for default entity support.
  1952. * Initially the parser always keep entity references instead of substituting
  1953. * entity values in the output. This function has to be used to change the
  1954. * default parser behavior
  1955. * SAX::substituteEntities() has to be used for changing that on a file by
  1956. * file basis.
  1957. *
  1958. * Returns the last value for 0 for no substitution, 1 for substitution.
  1959. */
  1960. int
  1961. xmlSubstituteEntitiesDefault(int val) {
  1962. int old = xmlSubstituteEntitiesDefaultValue;
  1963. xmlSubstituteEntitiesDefaultValue = val;
  1964. return(old);
  1965. }
  1966. /**
  1967. * xmlKeepBlanksDefault:
  1968. * @val: int 0 or 1
  1969. *
  1970. * Set and return the previous value for default blanks text nodes support.
  1971. * The 1.x version of the parser used an heuristic to try to detect
  1972. * ignorable white spaces. As a result the SAX callback was generating
  1973. * xmlSAX2IgnorableWhitespace() callbacks instead of characters() one, and when
  1974. * using the DOM output text nodes containing those blanks were not generated.
  1975. * The 2.x and later version will switch to the XML standard way and
  1976. * ignorableWhitespace() are only generated when running the parser in
  1977. * validating mode and when the current element doesn't allow CDATA or
  1978. * mixed content.
  1979. * This function is provided as a way to force the standard behavior
  1980. * on 1.X libs and to switch back to the old mode for compatibility when
  1981. * running 1.X client code on 2.X . Upgrade of 1.X code should be done
  1982. * by using xmlIsBlankNode() commodity function to detect the "empty"
  1983. * nodes generated.
  1984. * This value also affect autogeneration of indentation when saving code
  1985. * if blanks sections are kept, indentation is not generated.
  1986. *
  1987. * Returns the last value for 0 for no substitution, 1 for substitution.
  1988. */
  1989. int
  1990. xmlKeepBlanksDefault(int val) {
  1991. int old = xmlKeepBlanksDefaultValue;
  1992. xmlKeepBlanksDefaultValue = val;
  1993. if (!val) xmlIndentTreeOutput = 1;
  1994. return(old);
  1995. }
  1996. #define bottom_parserInternals
  1997. #include "elfgcchack.h"