HTMLparser.c 208 KB


  1. /*
  2. * HTMLparser.c : an HTML 4.0 non-verifying parser
  3. *
  4. * See Copyright for the status of this software.
  5. *
  6. * daniel@veillard.com
  7. */
  8. #define IN_LIBXML
  9. #include "libxml.h"
  10. #ifdef LIBXML_HTML_ENABLED
  11. #include <string.h>
  12. #ifdef HAVE_CTYPE_H
  13. #include <ctype.h>
  14. #endif
  15. #ifdef HAVE_STDLIB_H
  16. #include <stdlib.h>
  17. #endif
  18. #ifdef HAVE_SYS_STAT_H
  19. #include <sys/stat.h>
  20. #endif
  21. #ifdef HAVE_FCNTL_H
  22. #include <fcntl.h>
  23. #endif
  24. #ifdef HAVE_UNISTD_H
  25. #include <unistd.h>
  26. #endif
  27. #ifdef LIBXML_ZLIB_ENABLED
  28. #include <zlib.h>
  29. #endif
  30. #include <libxml/xmlmemory.h>
  31. #include <libxml/tree.h>
  32. #include <libxml/parser.h>
  33. #include <libxml/parserInternals.h>
  34. #include <libxml/xmlerror.h>
  35. #include <libxml/HTMLparser.h>
  36. #include <libxml/HTMLtree.h>
  37. #include <libxml/entities.h>
  38. #include <libxml/encoding.h>
  39. #include <libxml/valid.h>
  40. #include <libxml/xmlIO.h>
  41. #include <libxml/globals.h>
  42. #include <libxml/uri.h>
  43. #include "buf.h"
  44. #include "enc.h"
  /* Fixed size constants; the code that uses them lies outside this part of the file. */
  45. #define HTML_MAX_NAMELEN 1000
  46. #define HTML_PARSER_BIG_BUFFER_SIZE 1000
  47. #define HTML_PARSER_BUFFER_SIZE 100
  48. /* #define DEBUG */
  49. /* #define DEBUG_PUSH */
  /* File-local flag, initialised to 1. NOTE(review): presumably controls insertion of omitted default elements -- confirm against its users. */
  50. static int htmlOmittedDefaultValue = 1;
  /* Prototype of a non-static function; its definition is not visible in this part of the file. */
  51. xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
  52. xmlChar end, xmlChar end2, xmlChar end3);
  /* Forward declaration of a file-local function defined elsewhere in this file. */
  53. static void htmlParseComment(htmlParserCtxtPtr ctxt);
  54. /************************************************************************
  55. * *
  56. * Some factorized error routines *
  57. * *
  58. ************************************************************************/
  59. /**
  60. * htmlErrMemory:
  61. * @ctxt: an HTML parser context
  62. * @extra: extra information
  63. *
  64. * Handle a redefinition of attribute error
  65. */
  66. static void
  67. htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
  68. {
  69. if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  70. (ctxt->instate == XML_PARSER_EOF))
  71. return;
  72. if (ctxt != NULL) {
  73. ctxt->errNo = XML_ERR_NO_MEMORY;
  74. ctxt->instate = XML_PARSER_EOF;
  75. ctxt->disableSAX = 1;
  76. }
  77. if (extra)
  78. __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  79. XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
  80. NULL, NULL, 0, 0,
  81. "Memory allocation failed : %s\n", extra);
  82. else
  83. __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  84. XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
  85. NULL, NULL, 0, 0, "Memory allocation failed\n");
  86. }
  87. /**
  88. * htmlParseErr:
  89. * @ctxt: an HTML parser context
  90. * @error: the error number
  91. * @msg: the error message
  92. * @str1: string infor
  93. * @str2: string infor
  94. *
  95. * Handle a fatal parser error, i.e. violating Well-Formedness constraints
  96. */
  97. static void LIBXML_ATTR_FORMAT(3,0)
  98. htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
  99. const char *msg, const xmlChar *str1, const xmlChar *str2)
  100. {
  101. if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  102. (ctxt->instate == XML_PARSER_EOF))
  103. return;
  104. if (ctxt != NULL)
  105. ctxt->errNo = error;
  106. __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
  107. XML_ERR_ERROR, NULL, 0,
  108. (const char *) str1, (const char *) str2,
  109. NULL, 0, 0,
  110. msg, str1, str2);
  111. if (ctxt != NULL)
  112. ctxt->wellFormed = 0;
  113. }
  114. /**
  115. * htmlParseErrInt:
  116. * @ctxt: an HTML parser context
  117. * @error: the error number
  118. * @msg: the error message
  119. * @val: integer info
  120. *
  121. * Handle a fatal parser error, i.e. violating Well-Formedness constraints
  122. */
  123. static void LIBXML_ATTR_FORMAT(3,0)
  124. htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
  125. const char *msg, int val)
  126. {
  127. if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  128. (ctxt->instate == XML_PARSER_EOF))
  129. return;
  130. if (ctxt != NULL)
  131. ctxt->errNo = error;
  132. __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
  133. XML_ERR_ERROR, NULL, 0, NULL, NULL,
  134. NULL, val, 0, msg, val);
  135. if (ctxt != NULL)
  136. ctxt->wellFormed = 0;
  137. }
  138. /************************************************************************
  139. * *
  140. * Parser stacks related functions and macros *
  141. * *
  142. ************************************************************************/
  143. /**
  144. * htmlnamePush:
  145. * @ctxt: an HTML parser context
  146. * @value: the element name
  147. *
  148. * Pushes a new element name on top of the name stack
  149. *
  150. * Returns 0 in case of error, the index in the stack otherwise
  151. */
  152. static int
  153. htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
  154. {
  155. if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
  156. ctxt->html = 3;
  157. if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
  158. ctxt->html = 10;
  159. if (ctxt->nameNr >= ctxt->nameMax) {
  160. ctxt->nameMax *= 2;
  161. ctxt->nameTab = (const xmlChar * *)
  162. xmlRealloc((xmlChar * *)ctxt->nameTab,
  163. ctxt->nameMax *
  164. sizeof(ctxt->nameTab[0]));
  165. if (ctxt->nameTab == NULL) {
  166. htmlErrMemory(ctxt, NULL);
  167. return (0);
  168. }
  169. }
  170. ctxt->nameTab[ctxt->nameNr] = value;
  171. ctxt->name = value;
  172. return (ctxt->nameNr++);
  173. }
  174. /**
  175. * htmlnamePop:
  176. * @ctxt: an HTML parser context
  177. *
  178. * Pops the top element name from the name stack
  179. *
  180. * Returns the name just removed
  181. */
  182. static const xmlChar *
  183. htmlnamePop(htmlParserCtxtPtr ctxt)
  184. {
  185. const xmlChar *ret;
  186. if (ctxt->nameNr <= 0)
  187. return (NULL);
  188. ctxt->nameNr--;
  189. if (ctxt->nameNr < 0)
  190. return (NULL);
  191. if (ctxt->nameNr > 0)
  192. ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
  193. else
  194. ctxt->name = NULL;
  195. ret = ctxt->nameTab[ctxt->nameNr];
  196. ctxt->nameTab[ctxt->nameNr] = NULL;
  197. return (ret);
  198. }
  199. /**
  200. * htmlNodeInfoPush:
  201. * @ctxt: an HTML parser context
  202. * @value: the node info
  203. *
  204. * Pushes a new element name on top of the node info stack
  205. *
  206. * Returns 0 in case of error, the index in the stack otherwise
  207. */
  208. static int
  209. htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
  210. {
  211. if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
  212. if (ctxt->nodeInfoMax == 0)
  213. ctxt->nodeInfoMax = 5;
  214. ctxt->nodeInfoMax *= 2;
  215. ctxt->nodeInfoTab = (htmlParserNodeInfo *)
  216. xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
  217. ctxt->nodeInfoMax *
  218. sizeof(ctxt->nodeInfoTab[0]));
  219. if (ctxt->nodeInfoTab == NULL) {
  220. htmlErrMemory(ctxt, NULL);
  221. return (0);
  222. }
  223. }
  224. ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
  225. ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
  226. return (ctxt->nodeInfoNr++);
  227. }
  228. /**
  229. * htmlNodeInfoPop:
  230. * @ctxt: an HTML parser context
  231. *
  232. * Pops the top element name from the node info stack
  233. *
  234. * Returns 0 in case of error, the pointer to NodeInfo otherwise
  235. */
  236. static htmlParserNodeInfo *
  237. htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
  238. {
  239. if (ctxt->nodeInfoNr <= 0)
  240. return (NULL);
  241. ctxt->nodeInfoNr--;
  242. if (ctxt->nodeInfoNr < 0)
  243. return (NULL);
  244. if (ctxt->nodeInfoNr > 0)
  245. ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
  246. else
  247. ctxt->nodeInfo = NULL;
  248. return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
  249. }
  250. /*
  251. * Macros for accessing the content. Those should be used only by the parser,
  252. * and not exported.
  253. *
  254. * Dirty macros, i.e. one needs to make assumptions about the context to use them
  255. *
  256. * CUR_PTR return the current pointer to the xmlChar to be parsed.
  257. * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
  258. * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
  259. * in UNICODE mode. This should be used internally by the parser
  260. * only to compare to ASCII values otherwise it would break when
  261. * running with UTF-8 encoding.
  262. * NXT(n) returns the n'th next xmlChar. Same as CUR it should be used only
  263. * to compare on ASCII based substring.
  264. * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
  265. * it should be used only to compare on ASCII based substring.
  266. * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
  267. * strings without newlines within the parser.
  268. *
  269. * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
  270. *
  271. * CURRENT Returns the current char value, with the full decoding of
  272. * UTF-8 if we are using this mode. It returns an int.
  273. * NEXT Skip to the next character, this does the proper decoding
  274. * in UTF-8 mode. It also pop-up unfinished entities on the fly.
  275. * NEXTL(l) Skip the current unicode character of l xmlChars long.
  276. * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
  277. */
  /* All macros below expect a variable named ctxt to be in scope at the point of use. */
  /* UPPER: current input byte passed through toupper(). */
  278. #define UPPER (toupper(*ctxt->input->cur))
  /* SKIP: advance both the input cursor and the column counter by val; expands to a bare comma expression. */
  279. #define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val)
  /* NXT / UPP: byte at offset val from the cursor, raw or passed through toupper(). */
  280. #define NXT(val) ctxt->input->cur[(val)]
  281. #define UPP(val) (toupper(ctxt->input->cur[(val)]))
  282. #define CUR_PTR ctxt->input->cur
  283. #define BASE_PTR ctxt->input->base
  /* SHRINK: calls xmlParserInputShrink() once more than 2 * INPUT_CHUNK bytes lie behind the cursor and fewer than 2 * INPUT_CHUNK lie ahead. Expands to a bare if statement (no do/while wrapper), so beware of a following else. */
  284. #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
  285. (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
  286. xmlParserInputShrink(ctxt->input)
  /* GROW: when ctxt->progressive is 0 and fewer than INPUT_CHUNK bytes remain, calls xmlParserInputGrow(). Also a bare if statement. */
  287. #define GROW if ((ctxt->progressive == 0) && \
  288. (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
  289. xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
  /* CURRENT and CUR (below) have the same expansion: the current input byte as an int. */
  290. #define CURRENT ((int) (*ctxt->input->cur))
  291. #define SKIP_BLANKS htmlSkipBlankChars(ctxt)
  292. /* Imported from XML */
  293. /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
  294. #define CUR ((int) (*ctxt->input->cur))
  295. #define NEXT xmlNextChar(ctxt)
  /* RAW: -1 while ctxt->token is non-zero, the current input byte otherwise. */
  296. #define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
  /* NEXTL: update line/col from the byte under the cursor (newline starts a new line at col 1), clear ctxt->token, then advance the cursor by l bytes. Note l is not parenthesized in the expansion. */
  297. #define NEXTL(l) do { \
  298. if (*(ctxt->input->cur) == '\n') { \
  299. ctxt->input->line++; ctxt->input->col = 1; \
  300. } else ctxt->input->col++; \
  301. ctxt->token = 0; ctxt->input->cur += l; \
  302. } while (0)
  303. /************
  304. \
  305. if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
  306. if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
  307. ************/
  /* CUR_CHAR / CUR_SCHAR: take the address of l, so l must be an lvalue that receives the character length. */
  308. #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
  309. #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
  /* COPY_BUF: store v into b at index i and advance i, directly for l == 1 and through xmlCopyChar() otherwise. Expands to a bare if/else, so it is not safe as the body of an unbraced if. */
  310. #define COPY_BUF(l,b,i,v) \
  311. if (l == 1) b[i++] = (xmlChar) v; \
  312. else i += xmlCopyChar(l,&b[i],v)
  313. /**
  314. * htmlFindEncoding:
  315. * @the HTML parser context
  316. *
  317. * Ty to find and encoding in the current data available in the input
  318. * buffer this is needed to try to switch to the proper encoding when
  319. * one face a character error.
  320. * That's an heuristic, since it's operating outside of parsing it could
  321. * try to use a meta which had been commented out, that's the reason it
  322. * should only be used in case of error, not as a default.
  323. *
  324. * Returns an encoding string or NULL if not found, the string need to
  325. * be freed
  326. */
  327. static xmlChar *
  328. htmlFindEncoding(xmlParserCtxtPtr ctxt) {
  329. const xmlChar *start, *cur, *end;
  330. if ((ctxt == NULL) || (ctxt->input == NULL) ||
  331. (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
  332. (ctxt->input->buf->encoder != NULL))
  333. return(NULL);
  334. if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
  335. return(NULL);
  336. start = ctxt->input->cur;
  337. end = ctxt->input->end;
  338. /* we also expect the input buffer to be zero terminated */
  339. if (*end != 0)
  340. return(NULL);
  341. cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
  342. if (cur == NULL)
  343. return(NULL);
  344. cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
  345. if (cur == NULL)
  346. return(NULL);
  347. cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
  348. if (cur == NULL)
  349. return(NULL);
  350. cur += 8;
  351. start = cur;
  352. while (((*cur >= 'A') && (*cur <= 'Z')) ||
  353. ((*cur >= 'a') && (*cur <= 'z')) ||
  354. ((*cur >= '0') && (*cur <= '9')) ||
  355. (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
  356. cur++;
  357. if (cur == start)
  358. return(NULL);
  359. return(xmlStrndup(start, cur - start));
  360. }
  361. /**
  362. * htmlCurrentChar:
  363. * @ctxt: the HTML parser context
  364. * @len: pointer to the length of the char read
  365. *
  366. * The current char value, if using UTF-8 this may actually span multiple
  367. * bytes in the input buffer. Implement the end of line normalization:
  368. * 2.11 End-of-Line Handling
  369. * If the encoding is unspecified, in the case we find an ISO-Latin-1
  370. * char, then the encoding converter is plugged in automatically.
  371. *
  372. * Returns the current char value and its length
  373. */
  374. static int
  375. htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
  376. const unsigned char *cur;
  377. unsigned char c;
  378. unsigned int val;
  379. if (ctxt->instate == XML_PARSER_EOF)
  380. return(0);
  381. if (ctxt->token != 0) {
  382. *len = 0;
  383. return(ctxt->token);
  384. }
  385. if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
  386. xmlChar * guess;
  387. xmlCharEncodingHandlerPtr handler;
  388. /*
  389. * Assume it's a fixed length encoding (1) with
  390. * a compatible encoding for the ASCII set, since
  391. * HTML constructs only use < 128 chars
  392. */
  393. if ((int) *ctxt->input->cur < 0x80) {
  394. *len = 1;
  395. if ((*ctxt->input->cur == 0) &&
  396. (ctxt->input->cur < ctxt->input->end)) {
  397. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  398. "Char 0x%X out of allowed range\n", 0);
  399. return(' ');
  400. }
  401. return((int) *ctxt->input->cur);
  402. }
  403. /*
  404. * Humm this is bad, do an automatic flow conversion
  405. */
  406. guess = htmlFindEncoding(ctxt);
  407. if (guess == NULL) {
  408. xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
  409. } else {
  410. if (ctxt->input->encoding != NULL)
  411. xmlFree((xmlChar *) ctxt->input->encoding);
  412. ctxt->input->encoding = guess;
  413. handler = xmlFindCharEncodingHandler((const char *) guess);
  414. if (handler != NULL) {
  415. /*
  416. * Don't use UTF-8 encoder which isn't required and
  417. * can produce invalid UTF-8.
  418. */
  419. if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
  420. xmlSwitchToEncoding(ctxt, handler);
  421. } else {
  422. htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
  423. "Unsupported encoding %s", guess, NULL);
  424. }
  425. }
  426. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  427. }
  428. /*
  429. * We are supposed to handle UTF8, check it's valid
  430. * From rfc2044: encoding of the Unicode values on UTF-8:
  431. *
  432. * UCS-4 range (hex.) UTF-8 octet sequence (binary)
  433. * 0000 0000-0000 007F 0xxxxxxx
  434. * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
  435. * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
  436. *
  437. * Check for the 0x110000 limit too
  438. */
  439. cur = ctxt->input->cur;
  440. c = *cur;
  441. if (c & 0x80) {
  442. if ((c & 0x40) == 0)
  443. goto encoding_error;
  444. if (cur[1] == 0) {
  445. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  446. cur = ctxt->input->cur;
  447. }
  448. if ((cur[1] & 0xc0) != 0x80)
  449. goto encoding_error;
  450. if ((c & 0xe0) == 0xe0) {
  451. if (cur[2] == 0) {
  452. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  453. cur = ctxt->input->cur;
  454. }
  455. if ((cur[2] & 0xc0) != 0x80)
  456. goto encoding_error;
  457. if ((c & 0xf0) == 0xf0) {
  458. if (cur[3] == 0) {
  459. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  460. cur = ctxt->input->cur;
  461. }
  462. if (((c & 0xf8) != 0xf0) ||
  463. ((cur[3] & 0xc0) != 0x80))
  464. goto encoding_error;
  465. /* 4-byte code */
  466. *len = 4;
  467. val = (cur[0] & 0x7) << 18;
  468. val |= (cur[1] & 0x3f) << 12;
  469. val |= (cur[2] & 0x3f) << 6;
  470. val |= cur[3] & 0x3f;
  471. if (val < 0x10000)
  472. goto encoding_error;
  473. } else {
  474. /* 3-byte code */
  475. *len = 3;
  476. val = (cur[0] & 0xf) << 12;
  477. val |= (cur[1] & 0x3f) << 6;
  478. val |= cur[2] & 0x3f;
  479. if (val < 0x800)
  480. goto encoding_error;
  481. }
  482. } else {
  483. /* 2-byte code */
  484. *len = 2;
  485. val = (cur[0] & 0x1f) << 6;
  486. val |= cur[1] & 0x3f;
  487. if (val < 0x80)
  488. goto encoding_error;
  489. }
  490. if (!IS_CHAR(val)) {
  491. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  492. "Char 0x%X out of allowed range\n", val);
  493. }
  494. return(val);
  495. } else {
  496. if ((*ctxt->input->cur == 0) &&
  497. (ctxt->input->cur < ctxt->input->end)) {
  498. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  499. "Char 0x%X out of allowed range\n", 0);
  500. *len = 1;
  501. return(' ');
  502. }
  503. /* 1-byte code */
  504. *len = 1;
  505. return((int) *ctxt->input->cur);
  506. }
  507. encoding_error:
  508. /*
  509. * If we detect an UTF8 error that probably mean that the
  510. * input encoding didn't get properly advertised in the
  511. * declaration header. Report the error and switch the encoding
  512. * to ISO-Latin-1 (if you don't like this policy, just declare the
  513. * encoding !)
  514. */
  515. {
  516. char buffer[150];
  517. if (ctxt->input->end - ctxt->input->cur >= 4) {
  518. snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
  519. ctxt->input->cur[0], ctxt->input->cur[1],
  520. ctxt->input->cur[2], ctxt->input->cur[3]);
  521. } else {
  522. snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
  523. }
  524. htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
  525. "Input is not proper UTF-8, indicate encoding !\n",
  526. BAD_CAST buffer, NULL);
  527. }
  528. /*
  529. * Don't switch encodings twice. Note that if there's an encoder, we
  530. * shouldn't receive invalid UTF-8 anyway.
  531. *
  532. * Note that if ctxt->input->buf == NULL, switching encodings is
  533. * impossible, see Gitlab issue #34.
  534. */
  535. if ((ctxt->input->buf != NULL) &&
  536. (ctxt->input->buf->encoder == NULL))
  537. xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
  538. *len = 1;
  539. return((int) *ctxt->input->cur);
  540. }
  541. /**
  542. * htmlSkipBlankChars:
  543. * @ctxt: the HTML parser context
  544. *
  545. * skip all blanks character found at that point in the input streams.
  546. *
  547. * Returns the number of space chars skipped
  548. */
  549. static int
  550. htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
  551. int res = 0;
  552. while (IS_BLANK_CH(*(ctxt->input->cur))) {
  553. if ((*ctxt->input->cur == 0) &&
  554. (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
  555. xmlPopInput(ctxt);
  556. } else {
  557. if (*(ctxt->input->cur) == '\n') {
  558. ctxt->input->line++; ctxt->input->col = 1;
  559. } else ctxt->input->col++;
  560. ctxt->input->cur++;
  561. if (*ctxt->input->cur == 0)
  562. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  563. }
  564. res++;
  565. }
  566. return(res);
  567. }
  568. /************************************************************************
  569. * *
  570. * The list of HTML elements and their properties *
  571. * *
  572. ************************************************************************/
  573. /*
  574. * Start Tag: 1 means the start tag can be omitted
  575. * End Tag: 1 means the end tag can be omitted
  576. * 2 means it's forbidden (empty elements)
  577. * 3 means the tag is stylistic and should be closed easily
  578. * Depr: this element is deprecated
  579. * DTD: 1 means that this element is valid only in the Loose DTD
  580. * 2 means that this element is valid only in the Frameset DTD
  581. *
  582. * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
  583. , subElements , impliedsubelt , Attributes, userdata
  584. */
  585. /* Definitions and a couple of vars for HTML Elements */
  586. #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
  587. #define NB_FONTSTYLE 8
  588. #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
  589. #define NB_PHRASE 10
  590. #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
  591. #define NB_SPECIAL 16
  592. #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
  593. #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
  594. #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
  595. #define NB_BLOCK NB_HEADING + NB_LIST + 14
  596. #define FORMCTRL "input", "select", "textarea", "label", "button"
  597. #define NB_FORMCTRL 5
  598. #define PCDATA
  599. #define NB_PCDATA 0
  600. #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
  601. #define NB_HEADING 6
  602. #define LIST "ul", "ol", "dir", "menu"
  603. #define NB_LIST 4
  604. #define MODIFIER
  605. #define NB_MODIFIER 0
  606. #define FLOW BLOCK,INLINE
  607. #define NB_FLOW NB_BLOCK + NB_INLINE
  608. #define EMPTY NULL
  609. static const char* const html_flow[] = { FLOW, NULL } ;
  610. static const char* const html_inline[] = { INLINE, NULL } ;
  611. /* placeholders: elts with content but no subelements */
  612. static const char* const html_pcdata[] = { NULL } ;
  613. #define html_cdata html_pcdata
  614. /* ... and for HTML Attributes */
  615. #define COREATTRS "id", "class", "style", "title"
  616. #define NB_COREATTRS 4
  617. #define I18N "lang", "dir"
  618. #define NB_I18N 2
  619. #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
  620. #define NB_EVENTS 9
  621. #define ATTRS COREATTRS,I18N,EVENTS
  622. #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS
  623. #define CELLHALIGN "align", "char", "charoff"
  624. #define NB_CELLHALIGN 3
  625. #define CELLVALIGN "valign"
  626. #define NB_CELLVALIGN 1
  627. static const char* const html_attrs[] = { ATTRS, NULL } ;
  628. static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
  629. static const char* const core_attrs[] = { COREATTRS, NULL } ;
  630. static const char* const i18n_attrs[] = { I18N, NULL } ;
  631. /* Other declarations that should go inline ... */
  632. static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
  633. "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
  634. "tabindex", "onfocus", "onblur", NULL } ;
  635. static const char* const target_attr[] = { "target", NULL } ;
  636. static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
  637. static const char* const alt_attr[] = { "alt", NULL } ;
  638. static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
  639. static const char* const href_attrs[] = { "href", NULL } ;
  640. static const char* const clear_attrs[] = { "clear", NULL } ;
  641. static const char* const inline_p[] = { INLINE, "p", NULL } ;
  642. static const char* const flow_param[] = { FLOW, "param", NULL } ;
  643. static const char* const applet_attrs[] = { COREATTRS , "codebase",
  644. "archive", "alt", "name", "height", "width", "align",
  645. "hspace", "vspace", NULL } ;
  646. static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
  647. "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
  648. static const char* const basefont_attrs[] =
  649. { "id", "size", "color", "face", NULL } ;
  650. static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
  651. static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
  652. static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
  653. static const char* const body_depr[] = { "background", "bgcolor", "text",
  654. "link", "vlink", "alink", NULL } ;
  655. static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
  656. "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
  657. static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
  658. static const char* const col_elt[] = { "col", NULL } ;
  659. static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
  660. static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
  661. static const char* const dl_contents[] = { "dt", "dd", NULL } ;
  662. static const char* const compact_attr[] = { "compact", NULL } ;
  663. static const char* const label_attr[] = { "label", NULL } ;
  664. static const char* const fieldset_contents[] = { FLOW, "legend" } ;
  665. static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
  666. static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
  667. static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
  668. static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
  669. static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
  670. static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
  671. static const char* const head_attrs[] = { I18N, "profile", NULL } ;
  672. static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
  673. static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
  674. static const char* const version_attr[] = { "version", NULL } ;
  675. static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
  676. static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
  677. static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
  678. static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
  679. static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
  680. static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
  681. static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
  682. static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
  683. static const char* const align_attr[] = { "align", NULL } ;
  684. static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
  685. static const char* const map_contents[] = { BLOCK, "area", NULL } ;
  686. static const char* const name_attr[] = { "name", NULL } ;
  687. static const char* const action_attr[] = { "action", NULL } ;
  688. static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
  689. static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
  690. static const char* const content_attr[] = { "content", NULL } ;
  691. static const char* const type_attr[] = { "type", NULL } ;
  692. static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
  693. static const char* const object_contents[] = { FLOW, "param", NULL } ;
  694. static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
  695. static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
  696. static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
  697. static const char* const option_elt[] = { "option", NULL } ;
  698. static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
  699. static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
  700. static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
  701. static const char* const width_attr[] = { "width", NULL } ;
  702. static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
  703. static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
  704. static const char* const language_attr[] = { "language", NULL } ;
  705. static const char* const select_content[] = { "optgroup", "option", NULL } ;
  706. static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
  707. static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
  708. static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
  709. static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
  710. static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
  711. static const char* const tr_elt[] = { "tr", NULL } ;
  712. static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
  713. static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
  714. static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
  715. static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
  716. static const char* const tr_contents[] = { "th", "td", NULL } ;
  717. static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
  718. static const char* const li_elt[] = { "li", NULL } ;
  719. static const char* const ul_depr[] = { "type", "compact", NULL} ;
  720. static const char* const dir_attr[] = { "dir", NULL} ;
  /*
   * DECL casts one of the "const char* const []" lists above to
   * "const char **" so that it can be stored in a table entry.
   */
  721. #define DECL (const char**)
  /*
   * Table of the known HTML elements, one initializer per element.
   *
   * The entries MUST stay sorted by element name: htmlTagLookup() searches
   * this array with bsearch() and htmlCompareTags(), which compares names
   * with xmlStrcasecmp().
   *
   * NOTE(review): the field order follows the htmlElemDesc structure, which
   * is declared outside of this file (presumably libxml/HTMLparser.h); check
   * it there before adding or reordering columns.  Within this file only the
   * "name" and "endTag" fields are read directly.  EMPTY and MODIFIER are
   * macros defined outside of this part of the file.
   */
  722. static const htmlElemDesc
  723. html40ElementTable[] = {
  724. { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
  725. DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
  726. },
  727. { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
  728. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  729. },
  730. { "acronym", 0, 0, 0, 0, 0, 0, 1, "",
  731. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  732. },
  733. { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
  734. DECL inline_p , NULL , DECL html_attrs, NULL, NULL
  735. },
  736. { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
  737. DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
  738. },
  739. { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
  740. EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
  741. },
  742. { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
  743. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  744. },
  745. { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
  746. EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
  747. },
  748. { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
  749. EMPTY , NULL , NULL, DECL basefont_attrs, NULL
  750. },
  751. { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
  752. DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
  753. },
  754. { "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
  755. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  756. },
  757. { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
  758. DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
  759. },
  760. { "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
  761. DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
  762. },
  763. { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
  764. EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
  765. },
  766. { "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
  767. DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
  768. },
  769. { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
  770. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  771. },
  772. { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
  773. DECL html_flow , NULL , NULL, DECL html_attrs, NULL
  774. },
  775. { "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
  776. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  777. },
  778. { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
  779. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  780. },
  781. { "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
  782. EMPTY , NULL , DECL col_attrs , NULL, NULL
  783. },
  784. { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
  785. DECL col_elt , "col" , DECL col_attrs , NULL, NULL
  786. },
  787. { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
  788. DECL html_flow , NULL , DECL html_attrs, NULL, NULL
  789. },
  790. { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
  791. DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
  792. },
  793. { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
  794. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  795. },
  796. { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
  797. DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
  798. },
  799. { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
  800. DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
  801. },
  802. { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
  803. DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
  804. },
  805. { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
  806. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  807. },
  808. { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
  809. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  810. },
  811. { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
  812. EMPTY, NULL, DECL embed_attrs, NULL, NULL
  813. },
  814. { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
  815. DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
  816. },
  817. { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
  818. DECL html_inline, NULL, NULL, DECL font_attrs, NULL
  819. },
  820. { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
  821. DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
  822. },
  823. { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
  824. EMPTY, NULL, NULL, DECL frame_attrs, NULL
  825. },
  826. { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
  827. DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
  828. },
  829. { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
  830. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  831. },
  832. { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
  833. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  834. },
  835. { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
  836. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  837. },
  838. { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
  839. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  840. },
  841. { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
  842. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  843. },
  844. { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
  845. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  846. },
  847. { "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
  848. DECL head_contents, NULL, DECL head_attrs, NULL, NULL
  849. },
  850. { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
  851. EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
  852. },
  853. { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
  854. DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
  855. },
  856. { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
  857. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  858. },
  859. { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
  860. DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
  861. },
  862. { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
  863. EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
  864. },
  865. { "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
  866. EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
  867. },
  868. { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
  869. DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
  870. },
  871. { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
  872. EMPTY, NULL, NULL, DECL prompt_attrs, NULL
  873. },
  874. { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
  875. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  876. },
  877. { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
  878. DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
  879. },
  880. { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
  881. DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
  882. },
  883. { "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
  884. DECL html_flow, NULL, DECL html_attrs, NULL, NULL
  885. },
  886. { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
  887. EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
  888. },
  889. { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
  890. DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
  891. },
  892. { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
  893. DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
  894. },
  895. { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
  896. EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
  897. },
  898. { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
  899. DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
  900. },
  901. { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
  902. DECL html_flow, "div", DECL html_attrs, NULL, NULL
  903. },
  904. { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
  905. DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
  906. },
  907. { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
  908. DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
  909. },
  910. { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
  911. DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
  912. },
  913. { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
  914. DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
  915. },
  916. { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
  917. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  918. },
  919. { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
  920. EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
  921. },
  922. { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
  923. DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
  924. },
  925. { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
  926. DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
  927. },
  928. { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
  929. DECL html_inline, NULL, NULL, DECL html_attrs, NULL
  930. },
  931. { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
  932. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  933. },
  934. { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
  935. DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
  936. },
  937. { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
  938. DECL select_content, NULL, DECL select_attrs, NULL, NULL
  939. },
  940. { "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
  941. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  942. },
  943. { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
  944. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  945. },
  946. { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
  947. DECL html_inline, NULL, NULL, DECL html_attrs, NULL
  948. },
  949. { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
  950. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  951. },
  952. { "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
  953. DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
  954. },
  955. { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
  956. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  957. },
  958. { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
  959. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  960. },
  961. { "table", 0, 0, 0, 0, 0, 0, 0, "",
  962. DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
  963. },
  964. { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
  965. DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
  966. },
  967. { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
  968. DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
  969. },
  970. { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
  971. DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
  972. },
  973. { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
  974. DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
  975. },
  976. { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
  977. DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
  978. },
  979. { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
  980. DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
  981. },
  982. { "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
  983. DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
  984. },
  985. { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
  986. DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
  987. },
  988. { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
  989. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  990. },
  991. { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
  992. DECL html_inline, NULL, NULL, DECL html_attrs, NULL
  993. },
  994. { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
  995. DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
  996. },
  997. { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
  998. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  999. }
  1000. };
  /*
   * One auto-close rule: while an element named oldTag is the current open
   * element, a start tag named newTag implies that oldTag must be closed.
   * The same structure is used for the entries of htmlStartClose and as the
   * search key built by htmlCheckAutoClose().
   */
  1001. typedef struct {
  1002. const char *oldTag;
  1003. const char *newTag;
  1004. } htmlStartCloseEntry;
  1005. /*
  1006. * start tags that imply the end of current element
  1007. */
  /*
   * Each entry is { oldTag, newTag }.  The table MUST stay sorted by oldTag
   * and then by newTag in strcmp() order: htmlCheckAutoClose() searches it
   * with bsearch() and htmlCompareStartClose().  The comparison is
   * case-sensitive and every name stored here is lowercase.
   */
  1008. static const htmlStartCloseEntry htmlStartClose[] = {
  1009. { "a", "a" },
  1010. { "a", "fieldset" },
  1011. { "a", "table" },
  1012. { "a", "td" },
  1013. { "a", "th" },
  1014. { "address", "dd" },
  1015. { "address", "dl" },
  1016. { "address", "dt" },
  1017. { "address", "form" },
  1018. { "address", "li" },
  1019. { "address", "ul" },
  1020. { "b", "center" },
  1021. { "b", "p" },
  1022. { "b", "td" },
  1023. { "b", "th" },
  1024. { "big", "p" },
  1025. { "caption", "col" },
  1026. { "caption", "colgroup" },
  1027. { "caption", "tbody" },
  1028. { "caption", "tfoot" },
  1029. { "caption", "thead" },
  1030. { "caption", "tr" },
  1031. { "col", "col" },
  1032. { "col", "colgroup" },
  1033. { "col", "tbody" },
  1034. { "col", "tfoot" },
  1035. { "col", "thead" },
  1036. { "col", "tr" },
  1037. { "colgroup", "colgroup" },
  1038. { "colgroup", "tbody" },
  1039. { "colgroup", "tfoot" },
  1040. { "colgroup", "thead" },
  1041. { "colgroup", "tr" },
  1042. { "dd", "dt" },
  1043. { "dir", "dd" },
  1044. { "dir", "dl" },
  1045. { "dir", "dt" },
  1046. { "dir", "form" },
  1047. { "dir", "ul" },
  1048. { "dl", "form" },
  1049. { "dl", "li" },
  1050. { "dt", "dd" },
  1051. { "dt", "dl" },
  1052. { "font", "center" },
  1053. { "font", "td" },
  1054. { "font", "th" },
  1055. { "form", "form" },
  1056. { "h1", "fieldset" },
  1057. { "h1", "form" },
  1058. { "h1", "li" },
  1059. { "h1", "p" },
  1060. { "h1", "table" },
  1061. { "h2", "fieldset" },
  1062. { "h2", "form" },
  1063. { "h2", "li" },
  1064. { "h2", "p" },
  1065. { "h2", "table" },
  1066. { "h3", "fieldset" },
  1067. { "h3", "form" },
  1068. { "h3", "li" },
  1069. { "h3", "p" },
  1070. { "h3", "table" },
  1071. { "h4", "fieldset" },
  1072. { "h4", "form" },
  1073. { "h4", "li" },
  1074. { "h4", "p" },
  1075. { "h4", "table" },
  1076. { "h5", "fieldset" },
  1077. { "h5", "form" },
  1078. { "h5", "li" },
  1079. { "h5", "p" },
  1080. { "h5", "table" },
  1081. { "h6", "fieldset" },
  1082. { "h6", "form" },
  1083. { "h6", "li" },
  1084. { "h6", "p" },
  1085. { "h6", "table" },
  1086. { "head", "a" },
  1087. { "head", "abbr" },
  1088. { "head", "acronym" },
  1089. { "head", "address" },
  1090. { "head", "b" },
  1091. { "head", "bdo" },
  1092. { "head", "big" },
  1093. { "head", "blockquote" },
  1094. { "head", "body" },
  1095. { "head", "br" },
  1096. { "head", "center" },
  1097. { "head", "cite" },
  1098. { "head", "code" },
  1099. { "head", "dd" },
  1100. { "head", "dfn" },
  1101. { "head", "dir" },
  1102. { "head", "div" },
  1103. { "head", "dl" },
  1104. { "head", "dt" },
  1105. { "head", "em" },
  1106. { "head", "fieldset" },
  1107. { "head", "font" },
  1108. { "head", "form" },
  1109. { "head", "frameset" },
  1110. { "head", "h1" },
  1111. { "head", "h2" },
  1112. { "head", "h3" },
  1113. { "head", "h4" },
  1114. { "head", "h5" },
  1115. { "head", "h6" },
  1116. { "head", "hr" },
  1117. { "head", "i" },
  1118. { "head", "iframe" },
  1119. { "head", "img" },
  1120. { "head", "kbd" },
  1121. { "head", "li" },
  1122. { "head", "listing" },
  1123. { "head", "map" },
  1124. { "head", "menu" },
  1125. { "head", "ol" },
  1126. { "head", "p" },
  1127. { "head", "pre" },
  1128. { "head", "q" },
  1129. { "head", "s" },
  1130. { "head", "samp" },
  1131. { "head", "small" },
  1132. { "head", "span" },
  1133. { "head", "strike" },
  1134. { "head", "strong" },
  1135. { "head", "sub" },
  1136. { "head", "sup" },
  1137. { "head", "table" },
  1138. { "head", "tt" },
  1139. { "head", "u" },
  1140. { "head", "ul" },
  1141. { "head", "var" },
  1142. { "head", "xmp" },
  1143. { "hr", "form" },
  1144. { "i", "center" },
  1145. { "i", "p" },
  1146. { "i", "td" },
  1147. { "i", "th" },
  1148. { "legend", "fieldset" },
  1149. { "li", "li" },
  1150. { "link", "body" },
  1151. { "link", "frameset" },
  1152. { "listing", "dd" },
  1153. { "listing", "dl" },
  1154. { "listing", "dt" },
  1155. { "listing", "fieldset" },
  1156. { "listing", "form" },
  1157. { "listing", "li" },
  1158. { "listing", "table" },
  1159. { "listing", "ul" },
  1160. { "menu", "dd" },
  1161. { "menu", "dl" },
  1162. { "menu", "dt" },
  1163. { "menu", "form" },
  1164. { "menu", "ul" },
  1165. { "ol", "form" },
  1166. { "ol", "ul" },
  1167. { "option", "optgroup" },
  1168. { "option", "option" },
  1169. { "p", "address" },
  1170. { "p", "blockquote" },
  1171. { "p", "body" },
  1172. { "p", "caption" },
  1173. { "p", "center" },
  1174. { "p", "col" },
  1175. { "p", "colgroup" },
  1176. { "p", "dd" },
  1177. { "p", "dir" },
  1178. { "p", "div" },
  1179. { "p", "dl" },
  1180. { "p", "dt" },
  1181. { "p", "fieldset" },
  1182. { "p", "form" },
  1183. { "p", "frameset" },
  1184. { "p", "h1" },
  1185. { "p", "h2" },
  1186. { "p", "h3" },
  1187. { "p", "h4" },
  1188. { "p", "h5" },
  1189. { "p", "h6" },
  1190. { "p", "head" },
  1191. { "p", "hr" },
  1192. { "p", "li" },
  1193. { "p", "listing" },
  1194. { "p", "menu" },
  1195. { "p", "ol" },
  1196. { "p", "p" },
  1197. { "p", "pre" },
  1198. { "p", "table" },
  1199. { "p", "tbody" },
  1200. { "p", "td" },
  1201. { "p", "tfoot" },
  1202. { "p", "th" },
  1203. { "p", "title" },
  1204. { "p", "tr" },
  1205. { "p", "ul" },
  1206. { "p", "xmp" },
  1207. { "pre", "dd" },
  1208. { "pre", "dl" },
  1209. { "pre", "dt" },
  1210. { "pre", "fieldset" },
  1211. { "pre", "form" },
  1212. { "pre", "li" },
  1213. { "pre", "table" },
  1214. { "pre", "ul" },
  1215. { "s", "p" },
  1216. { "script", "noscript" },
  1217. { "small", "p" },
  1218. { "span", "td" },
  1219. { "span", "th" },
  1220. { "strike", "p" },
  1221. { "style", "body" },
  1222. { "style", "frameset" },
  1223. { "tbody", "tbody" },
  1224. { "tbody", "tfoot" },
  1225. { "td", "tbody" },
  1226. { "td", "td" },
  1227. { "td", "tfoot" },
  1228. { "td", "th" },
  1229. { "td", "tr" },
  1230. { "tfoot", "tbody" },
  1231. { "th", "tbody" },
  1232. { "th", "td" },
  1233. { "th", "tfoot" },
  1234. { "th", "th" },
  1235. { "th", "tr" },
  1236. { "thead", "tbody" },
  1237. { "thead", "tfoot" },
  1238. { "title", "body" },
  1239. { "title", "frameset" },
  1240. { "tr", "tbody" },
  1241. { "tr", "tfoot" },
  1242. { "tr", "tr" },
  1243. { "tt", "p" },
  1244. { "u", "p" },
  1245. { "u", "td" },
  1246. { "u", "th" },
  1247. { "ul", "address" },
  1248. { "ul", "form" },
  1249. { "ul", "menu" },
  1250. { "ul", "ol" },
  1251. { "ul", "pre" },
  1252. { "xmp", "dd" },
  1253. { "xmp", "dl" },
  1254. { "xmp", "dt" },
  1255. { "xmp", "fieldset" },
  1256. { "xmp", "form" },
  1257. { "xmp", "li" },
  1258. { "xmp", "table" },
  1259. { "xmp", "ul" }
  1260. };
  1261. /*
  1262. * The list of HTML elements which are supposed not to have
  1263. * CDATA content and where a p element will be implied
  1264. *
  1265. * TODO: extend that list by reading the HTML SGML DTD on
  1266. * implied paragraph
  1267. */
  /*
   * The list is NULL-terminated; htmlCheckParagraph() walks it until the
   * NULL entry and compares each name with the current element name.
   */
  1268. static const char *const htmlNoContentElements[] = {
  1269. "html",
  1270. "head",
  1271. NULL
  1272. };
  1273. /*
  1274. * The list of HTML attributes which are of content %Script;
  1275. * NOTE: when adding ones, check htmlIsScriptAttribute() since
  1276. * it assumes the name starts with 'on'
  1277. */
  /*
   * Unlike the other name lists this array has no NULL terminator:
   * htmlIsScriptAttribute() derives the number of entries from sizeof.
   */
  1278. static const char *const htmlScriptAttributes[] = {
  1279. "onclick",
  1280. "ondblclick",
  1281. "onmousedown",
  1282. "onmouseup",
  1283. "onmouseover",
  1284. "onmousemove",
  1285. "onmouseout",
  1286. "onkeypress",
  1287. "onkeydown",
  1288. "onkeyup",
  1289. "onload",
  1290. "onunload",
  1291. "onfocus",
  1292. "onblur",
  1293. "onsubmit",
  1294. "onreset",
  1295. "onchange",
  1296. "onselect"
  1297. };
  1298. /*
  1299. * This table is used by the htmlparser to know what to do with
  1300. * broken html pages. By assigning different priorities to different
  1301. * elements the parser can decide how to handle extra endtags.
  1302. * Endtags are only allowed to close elements with lower or equal
  1303. * priority.
  1304. */
  /*
   * The table is scanned linearly by htmlGetEndPriority().  The last entry,
   * whose name is NULL, ends the scan and its priority (100) is the value
   * returned for every element name that is not listed, so it must remain
   * the final entry.
   */
  1305. typedef struct {
  1306. const char *name;
  1307. int priority;
  1308. } elementPriority;
  1309. static const elementPriority htmlEndPriority[] = {
  1310. {"div", 150},
  1311. {"td", 160},
  1312. {"th", 160},
  1313. {"tr", 170},
  1314. {"thead", 180},
  1315. {"tbody", 180},
  1316. {"tfoot", 180},
  1317. {"table", 190},
  1318. {"head", 200},
  1319. {"body", 200},
  1320. {"html", 220},
  1321. {NULL, 100} /* Default priority */
  1322. };
  1323. /************************************************************************
  1324. * *
  1325. * functions to handle HTML specific data *
  1326. * *
  1327. ************************************************************************/
  /**
   * htmlInitAutoClose:
   *
   * This function is now a no-op: it takes no argument, performs no work
   * and returns nothing.
   */
  void
  htmlInitAutoClose(void)
  {
      return;
  }
  1336. static int
  1337. htmlCompareTags(const void *key, const void *member) {
  1338. const xmlChar *tag = (const xmlChar *) key;
  1339. const htmlElemDesc *desc = (const htmlElemDesc *) member;
  1340. return(xmlStrcasecmp(tag, BAD_CAST desc->name));
  1341. }
  1342. /**
  1343. * htmlTagLookup:
  1344. * @tag: The tag name in lowercase
  1345. *
  1346. * Lookup the HTML tag in the ElementTable
  1347. *
  1348. * Returns the related htmlElemDescPtr or NULL if not found.
  1349. */
  1350. const htmlElemDesc *
  1351. htmlTagLookup(const xmlChar *tag) {
  1352. if (tag == NULL)
  1353. return(NULL);
  1354. return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
  1355. sizeof(html40ElementTable) / sizeof(htmlElemDesc),
  1356. sizeof(htmlElemDesc), htmlCompareTags));
  1357. }
  1358. /**
  1359. * htmlGetEndPriority:
  1360. * @name: The name of the element to look up the priority for.
  1361. *
  1362. * Return value: The "endtag" priority.
  1363. **/
  1364. static int
  1365. htmlGetEndPriority (const xmlChar *name) {
  1366. int i = 0;
  1367. while ((htmlEndPriority[i].name != NULL) &&
  1368. (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
  1369. i++;
  1370. return(htmlEndPriority[i].priority);
  1371. }
  1372. static int
  1373. htmlCompareStartClose(const void *vkey, const void *member) {
  1374. const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
  1375. const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
  1376. int ret;
  1377. ret = strcmp(key->oldTag, entry->oldTag);
  1378. if (ret == 0)
  1379. ret = strcmp(key->newTag, entry->newTag);
  1380. return(ret);
  1381. }
  1382. /**
  1383. * htmlCheckAutoClose:
  1384. * @newtag: The new tag name
  1385. * @oldtag: The old tag name
  1386. *
  1387. * Checks whether the new tag is one of the registered valid tags for
  1388. * closing old.
  1389. *
  1390. * Returns 0 if no, 1 if yes.
  1391. */
  1392. static int
  1393. htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
  1394. {
  1395. htmlStartCloseEntry key;
  1396. void *res;
  1397. key.oldTag = (const char *) oldtag;
  1398. key.newTag = (const char *) newtag;
  1399. res = bsearch(&key, htmlStartClose,
  1400. sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
  1401. sizeof(htmlStartCloseEntry), htmlCompareStartClose);
  1402. return(res != NULL);
  1403. }
  1404. /**
  1405. * htmlAutoCloseOnClose:
  1406. * @ctxt: an HTML parser context
  1407. * @newtag: The new tag name
  1408. * @force: force the tag closure
  1409. *
  1410. * The HTML DTD allows an ending tag to implicitly close other tags.
  1411. */
  1412. static void
  1413. htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
  1414. {
  1415. const htmlElemDesc *info;
  1416. int i, priority;
  1417. priority = htmlGetEndPriority(newtag);
  1418. for (i = (ctxt->nameNr - 1); i >= 0; i--) {
  1419. if (xmlStrEqual(newtag, ctxt->nameTab[i]))
  1420. break;
  1421. /*
  1422. * A misplaced endtag can only close elements with lower
  1423. * or equal priority, so if we find an element with higher
  1424. * priority before we find an element with
  1425. * matching name, we just ignore this endtag
  1426. */
  1427. if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
  1428. return;
  1429. }
  1430. if (i < 0)
  1431. return;
  1432. while (!xmlStrEqual(newtag, ctxt->name)) {
  1433. info = htmlTagLookup(ctxt->name);
  1434. if ((info != NULL) && (info->endTag == 3)) {
  1435. htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
  1436. "Opening and ending tag mismatch: %s and %s\n",
  1437. newtag, ctxt->name);
  1438. }
  1439. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  1440. ctxt->sax->endElement(ctxt->userData, ctxt->name);
  1441. htmlnamePop(ctxt);
  1442. }
  1443. }
  1444. /**
  1445. * htmlAutoCloseOnEnd:
  1446. * @ctxt: an HTML parser context
  1447. *
  1448. * Close all remaining tags at the end of the stream
  1449. */
  1450. static void
  1451. htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
  1452. {
  1453. int i;
  1454. if (ctxt->nameNr == 0)
  1455. return;
  1456. for (i = (ctxt->nameNr - 1); i >= 0; i--) {
  1457. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  1458. ctxt->sax->endElement(ctxt->userData, ctxt->name);
  1459. htmlnamePop(ctxt);
  1460. }
  1461. }
  1462. /**
  1463. * htmlAutoClose:
  1464. * @ctxt: an HTML parser context
  1465. * @newtag: The new tag name or NULL
  1466. *
  1467. * The HTML DTD allows a tag to implicitly close other tags.
  1468. * The list is kept in htmlStartClose array. This function is
  1469. * called when a new tag has been detected and generates the
  1470. * appropriates closes if possible/needed.
  1471. * If newtag is NULL this mean we are at the end of the resource
  1472. * and we should check
  1473. */
  1474. static void
  1475. htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
  1476. {
  1477. while ((newtag != NULL) && (ctxt->name != NULL) &&
  1478. (htmlCheckAutoClose(newtag, ctxt->name))) {
  1479. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  1480. ctxt->sax->endElement(ctxt->userData, ctxt->name);
  1481. htmlnamePop(ctxt);
  1482. }
  1483. if (newtag == NULL) {
  1484. htmlAutoCloseOnEnd(ctxt);
  1485. return;
  1486. }
  1487. while ((newtag == NULL) && (ctxt->name != NULL) &&
  1488. ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
  1489. (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
  1490. (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
  1491. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  1492. ctxt->sax->endElement(ctxt->userData, ctxt->name);
  1493. htmlnamePop(ctxt);
  1494. }
  1495. }
  1496. /**
  1497. * htmlAutoCloseTag:
  1498. * @doc: the HTML document
  1499. * @name: The tag name
  1500. * @elem: the HTML element
  1501. *
  1502. * The HTML DTD allows a tag to implicitly close other tags.
  1503. * The list is kept in htmlStartClose array. This function checks
  1504. * if the element or one of it's children would autoclose the
  1505. * given tag.
  1506. *
  1507. * Returns 1 if autoclose, 0 otherwise
  1508. */
  1509. int
  1510. htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
  1511. htmlNodePtr child;
  1512. if (elem == NULL) return(1);
  1513. if (xmlStrEqual(name, elem->name)) return(0);
  1514. if (htmlCheckAutoClose(elem->name, name)) return(1);
  1515. child = elem->children;
  1516. while (child != NULL) {
  1517. if (htmlAutoCloseTag(doc, name, child)) return(1);
  1518. child = child->next;
  1519. }
  1520. return(0);
  1521. }
  1522. /**
  1523. * htmlIsAutoClosed:
  1524. * @doc: the HTML document
  1525. * @elem: the HTML element
  1526. *
  1527. * The HTML DTD allows a tag to implicitly close other tags.
  1528. * The list is kept in htmlStartClose array. This function checks
  1529. * if a tag is autoclosed by one of it's child
  1530. *
  1531. * Returns 1 if autoclosed, 0 otherwise
  1532. */
  1533. int
  1534. htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
  1535. htmlNodePtr child;
  1536. if (elem == NULL) return(1);
  1537. child = elem->children;
  1538. while (child != NULL) {
  1539. if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
  1540. child = child->next;
  1541. }
  1542. return(0);
  1543. }
  1544. /**
  1545. * htmlCheckImplied:
  1546. * @ctxt: an HTML parser context
  1547. * @newtag: The new tag name
  1548. *
  1549. * The HTML DTD allows a tag to exists only implicitly
  1550. * called when a new tag has been detected and generates the
  1551. * appropriates implicit tags if missing
  1552. */
  1553. static void
  1554. htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
  1555. int i;
  1556. if (ctxt->options & HTML_PARSE_NOIMPLIED)
  1557. return;
  1558. if (!htmlOmittedDefaultValue)
  1559. return;
  1560. if (xmlStrEqual(newtag, BAD_CAST"html"))
  1561. return;
  1562. if (ctxt->nameNr <= 0) {
  1563. htmlnamePush(ctxt, BAD_CAST"html");
  1564. if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
  1565. ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
  1566. }
  1567. if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
  1568. return;
  1569. if ((ctxt->nameNr <= 1) &&
  1570. ((xmlStrEqual(newtag, BAD_CAST"script")) ||
  1571. (xmlStrEqual(newtag, BAD_CAST"style")) ||
  1572. (xmlStrEqual(newtag, BAD_CAST"meta")) ||
  1573. (xmlStrEqual(newtag, BAD_CAST"link")) ||
  1574. (xmlStrEqual(newtag, BAD_CAST"title")) ||
  1575. (xmlStrEqual(newtag, BAD_CAST"base")))) {
  1576. if (ctxt->html >= 3) {
  1577. /* we already saw or generated an <head> before */
  1578. return;
  1579. }
  1580. /*
  1581. * dropped OBJECT ... i you put it first BODY will be
  1582. * assumed !
  1583. */
  1584. htmlnamePush(ctxt, BAD_CAST"head");
  1585. if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
  1586. ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
  1587. } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
  1588. (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
  1589. (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
  1590. if (ctxt->html >= 10) {
  1591. /* we already saw or generated a <body> before */
  1592. return;
  1593. }
  1594. for (i = 0;i < ctxt->nameNr;i++) {
  1595. if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
  1596. return;
  1597. }
  1598. if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
  1599. return;
  1600. }
  1601. }
  1602. htmlnamePush(ctxt, BAD_CAST"body");
  1603. if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
  1604. ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
  1605. }
  1606. }
  1607. /**
  1608. * htmlCheckParagraph
  1609. * @ctxt: an HTML parser context
  1610. *
  1611. * Check whether a p element need to be implied before inserting
  1612. * characters in the current element.
  1613. *
  1614. * Returns 1 if a paragraph has been inserted, 0 if not and -1
  1615. * in case of error.
  1616. */
  1617. static int
  1618. htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
  1619. const xmlChar *tag;
  1620. int i;
  1621. if (ctxt == NULL)
  1622. return(-1);
  1623. tag = ctxt->name;
  1624. if (tag == NULL) {
  1625. htmlAutoClose(ctxt, BAD_CAST"p");
  1626. htmlCheckImplied(ctxt, BAD_CAST"p");
  1627. htmlnamePush(ctxt, BAD_CAST"p");
  1628. if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
  1629. ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
  1630. return(1);
  1631. }
  1632. if (!htmlOmittedDefaultValue)
  1633. return(0);
  1634. for (i = 0; htmlNoContentElements[i] != NULL; i++) {
  1635. if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
  1636. htmlAutoClose(ctxt, BAD_CAST"p");
  1637. htmlCheckImplied(ctxt, BAD_CAST"p");
  1638. htmlnamePush(ctxt, BAD_CAST"p");
  1639. if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
  1640. ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
  1641. return(1);
  1642. }
  1643. }
  1644. return(0);
  1645. }
  1646. /**
  1647. * htmlIsScriptAttribute:
  1648. * @name: an attribute name
  1649. *
  1650. * Check if an attribute is of content type Script
  1651. *
  1652. * Returns 1 is the attribute is a script 0 otherwise
  1653. */
  1654. int
  1655. htmlIsScriptAttribute(const xmlChar *name) {
  1656. unsigned int i;
  1657. if (name == NULL)
  1658. return(0);
  1659. /*
  1660. * all script attributes start with 'on'
  1661. */
  1662. if ((name[0] != 'o') || (name[1] != 'n'))
  1663. return(0);
  1664. for (i = 0;
  1665. i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
  1666. i++) {
  1667. if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
  1668. return(1);
  1669. }
  1670. return(0);
  1671. }
  1672. /************************************************************************
  1673. * *
  1674. * The list of HTML predefined entities *
  1675. * *
  1676. ************************************************************************/
  1677. static const htmlEntityDesc html40EntitiesTable[] = {
  1678. /*
  1679. * the 4 absolute ones, plus apostrophe.
  1680. */
  1681. { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
  1682. { 38, "amp", "ampersand, U+0026 ISOnum" },
  1683. { 39, "apos", "single quote" },
  1684. { 60, "lt", "less-than sign, U+003C ISOnum" },
  1685. { 62, "gt", "greater-than sign, U+003E ISOnum" },
  1686. /*
  1687. * A bunch still in the 128-255 range
  1688. * Replacing them depend really on the charset used.
  1689. */
  1690. { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
  1691. { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
  1692. { 162, "cent", "cent sign, U+00A2 ISOnum" },
  1693. { 163, "pound","pound sign, U+00A3 ISOnum" },
  1694. { 164, "curren","currency sign, U+00A4 ISOnum" },
  1695. { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
  1696. { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
  1697. { 167, "sect", "section sign, U+00A7 ISOnum" },
  1698. { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
  1699. { 169, "copy", "copyright sign, U+00A9 ISOnum" },
  1700. { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
  1701. { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
  1702. { 172, "not", "not sign, U+00AC ISOnum" },
  1703. { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
  1704. { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
  1705. { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
  1706. { 176, "deg", "degree sign, U+00B0 ISOnum" },
  1707. { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
  1708. { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
  1709. { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
  1710. { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
  1711. { 181, "micro","micro sign, U+00B5 ISOnum" },
  1712. { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
  1713. { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
  1714. { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
  1715. { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
  1716. { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
  1717. { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
  1718. { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
  1719. { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
  1720. { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
  1721. { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
  1722. { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
  1723. { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
  1724. { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
  1725. { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
  1726. { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
  1727. { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
  1728. { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
  1729. { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
  1730. { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
  1731. { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
  1732. { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
  1733. { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
  1734. { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
  1735. { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
  1736. { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
  1737. { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
  1738. { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
  1739. { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
  1740. { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
  1741. { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
  1742. { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
  1743. { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
  1744. { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
  1745. { 215, "times","multiplication sign, U+00D7 ISOnum" },
  1746. { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
  1747. { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
  1748. { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
  1749. { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
  1750. { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
  1751. { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
  1752. { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
  1753. { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
  1754. { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
  1755. { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
  1756. { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
  1757. { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
  1758. { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
  1759. { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
  1760. { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
  1761. { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
  1762. { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
  1763. { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
  1764. { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
  1765. { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
  1766. { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
  1767. { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
  1768. { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
  1769. { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
  1770. { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
  1771. { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
  1772. { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
  1773. { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
  1774. { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
  1775. { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
  1776. { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
  1777. { 247, "divide","division sign, U+00F7 ISOnum" },
  1778. { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
  1779. { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
  1780. { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
  1781. { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
  1782. { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
  1783. { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
  1784. { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
  1785. { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
  1786. { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
  1787. { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
  1788. { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
  1789. { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
  1790. { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
  1791. /*
  1792. * Anything below should really be kept as entities references
  1793. */
  1794. { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
  1795. { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
  1796. { 732, "tilde","small tilde, U+02DC ISOdia" },
  1797. { 913, "Alpha","greek capital letter alpha, U+0391" },
  1798. { 914, "Beta", "greek capital letter beta, U+0392" },
  1799. { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
  1800. { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
  1801. { 917, "Epsilon","greek capital letter epsilon, U+0395" },
  1802. { 918, "Zeta", "greek capital letter zeta, U+0396" },
  1803. { 919, "Eta", "greek capital letter eta, U+0397" },
  1804. { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
  1805. { 921, "Iota", "greek capital letter iota, U+0399" },
  1806. { 922, "Kappa","greek capital letter kappa, U+039A" },
  1807. { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
  1808. { 924, "Mu", "greek capital letter mu, U+039C" },
  1809. { 925, "Nu", "greek capital letter nu, U+039D" },
  1810. { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
  1811. { 927, "Omicron","greek capital letter omicron, U+039F" },
  1812. { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
  1813. { 929, "Rho", "greek capital letter rho, U+03A1" },
  1814. { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
  1815. { 932, "Tau", "greek capital letter tau, U+03A4" },
  1816. { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
  1817. { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
  1818. { 935, "Chi", "greek capital letter chi, U+03A7" },
  1819. { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
  1820. { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
  1821. { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
  1822. { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
  1823. { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
  1824. { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
  1825. { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
  1826. { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
  1827. { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
  1828. { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
  1829. { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
  1830. { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
  1831. { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
  1832. { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
  1833. { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
  1834. { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
  1835. { 959, "omicron","greek small letter omicron, U+03BF NEW" },
  1836. { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
  1837. { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
  1838. { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
  1839. { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
  1840. { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
  1841. { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
  1842. { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
  1843. { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
  1844. { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
  1845. { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
  1846. { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
  1847. { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
  1848. { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
  1849. { 8194, "ensp", "en space, U+2002 ISOpub" },
  1850. { 8195, "emsp", "em space, U+2003 ISOpub" },
  1851. { 8201, "thinsp","thin space, U+2009 ISOpub" },
  1852. { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
  1853. { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
  1854. { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
  1855. { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
  1856. { 8211, "ndash","en dash, U+2013 ISOpub" },
  1857. { 8212, "mdash","em dash, U+2014 ISOpub" },
  1858. { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
  1859. { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
  1860. { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
  1861. { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
  1862. { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
  1863. { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
  1864. { 8224, "dagger","dagger, U+2020 ISOpub" },
  1865. { 8225, "Dagger","double dagger, U+2021 ISOpub" },
  1866. { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
  1867. { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
  1868. { 8240, "permil","per mille sign, U+2030 ISOtech" },
  1869. { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
  1870. { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
  1871. { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
  1872. { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
  1873. { 8254, "oline","overline = spacing overscore, U+203E NEW" },
  1874. { 8260, "frasl","fraction slash, U+2044 NEW" },
  1875. { 8364, "euro", "euro sign, U+20AC NEW" },
  1876. { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
  1877. { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
  1878. { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
  1879. { 8482, "trade","trade mark sign, U+2122 ISOnum" },
  1880. { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
  1881. { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
  1882. { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
  1883. { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
  1884. { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
  1885. { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
  1886. { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
  1887. { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
  1888. { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
  1889. { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
  1890. { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
  1891. { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
  1892. { 8704, "forall","for all, U+2200 ISOtech" },
  1893. { 8706, "part", "partial differential, U+2202 ISOtech" },
  1894. { 8707, "exist","there exists, U+2203 ISOtech" },
  1895. { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
  1896. { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
  1897. { 8712, "isin", "element of, U+2208 ISOtech" },
  1898. { 8713, "notin","not an element of, U+2209 ISOtech" },
  1899. { 8715, "ni", "contains as member, U+220B ISOtech" },
  1900. { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
  1901. { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
  1902. { 8722, "minus","minus sign, U+2212 ISOtech" },
  1903. { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
  1904. { 8730, "radic","square root = radical sign, U+221A ISOtech" },
  1905. { 8733, "prop", "proportional to, U+221D ISOtech" },
  1906. { 8734, "infin","infinity, U+221E ISOtech" },
  1907. { 8736, "ang", "angle, U+2220 ISOamso" },
  1908. { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
  1909. { 8744, "or", "logical or = vee, U+2228 ISOtech" },
  1910. { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
  1911. { 8746, "cup", "union = cup, U+222A ISOtech" },
  1912. { 8747, "int", "integral, U+222B ISOtech" },
  1913. { 8756, "there4","therefore, U+2234 ISOtech" },
  1914. { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
  1915. { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
  1916. { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
  1917. { 8800, "ne", "not equal to, U+2260 ISOtech" },
  1918. { 8801, "equiv","identical to, U+2261 ISOtech" },
  1919. { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
  1920. { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
  1921. { 8834, "sub", "subset of, U+2282 ISOtech" },
  1922. { 8835, "sup", "superset of, U+2283 ISOtech" },
  1923. { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
  1924. { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
  1925. { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
  1926. { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
  1927. { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
  1928. { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
  1929. { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
  1930. { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
  1931. { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
  1932. { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
  1933. { 8971, "rfloor","right floor, U+230B ISOamsc" },
  1934. { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
  1935. { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
  1936. { 9674, "loz", "lozenge, U+25CA ISOpub" },
  1937. { 9824, "spades","black spade suit, U+2660 ISOpub" },
  1938. { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
  1939. { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
  1940. { 9830, "diams","black diamond suit, U+2666 ISOpub" },
  1941. };
  1942. /************************************************************************
  1943. * *
  1944. * Commodity functions to handle entities *
  1945. * *
  1946. ************************************************************************/
  /*
   * Macro used to grow the current buffer.
   *
   * Doubles buffer##_size and reallocates @buffer to that many xmlChar.
   * When the reallocation fails the error is reported through
   * htmlErrMemory(), @buffer is freed and the *enclosing function*
   * returns NULL. The expansion expects a variable named `ctxt` and a
   * companion variable `<buffer>_size` to be in scope at the call site.
   */
  #define growBuffer(buffer) { \
      xmlChar *tmp; \
      buffer##_size = buffer##_size * 2; \
      tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
      if (tmp != NULL) { \
          buffer = tmp; \
      } else { \
          htmlErrMemory(ctxt, "growing buffer\n"); \
          xmlFree(buffer); \
          return(NULL); \
      } \
  }
  1961. /**
  1962. * htmlEntityLookup:
  1963. * @name: the entity name
  1964. *
  1965. * Lookup the given entity in EntitiesTable
  1966. *
  1967. * TODO: the linear scan is really ugly, an hash table is really needed.
  1968. *
  1969. * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
  1970. */
  1971. const htmlEntityDesc *
  1972. htmlEntityLookup(const xmlChar *name) {
  1973. unsigned int i;
  1974. for (i = 0;i < (sizeof(html40EntitiesTable)/
  1975. sizeof(html40EntitiesTable[0]));i++) {
  1976. if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
  1977. return((htmlEntityDescPtr) &html40EntitiesTable[i]);
  1978. }
  1979. }
  1980. return(NULL);
  1981. }
  1982. /**
  1983. * htmlEntityValueLookup:
  1984. * @value: the entity's unicode value
  1985. *
  1986. * Lookup the given entity in EntitiesTable
  1987. *
  1988. * TODO: the linear scan is really ugly, an hash table is really needed.
  1989. *
  1990. * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
  1991. */
  1992. const htmlEntityDesc *
  1993. htmlEntityValueLookup(unsigned int value) {
  1994. unsigned int i;
  1995. for (i = 0;i < (sizeof(html40EntitiesTable)/
  1996. sizeof(html40EntitiesTable[0]));i++) {
  1997. if (html40EntitiesTable[i].value >= value) {
  1998. if (html40EntitiesTable[i].value > value)
  1999. break;
  2000. return((htmlEntityDescPtr) &html40EntitiesTable[i]);
  2001. }
  2002. }
  2003. return(NULL);
  2004. }
  2005. /**
  2006. * UTF8ToHtml:
  2007. * @out: a pointer to an array of bytes to store the result
  2008. * @outlen: the length of @out
  2009. * @in: a pointer to an array of UTF-8 chars
  2010. * @inlen: the length of @in
  2011. *
  2012. * Take a block of UTF-8 chars in and try to convert it to an ASCII
  2013. * plus HTML entities block of chars out.
  2014. *
  2015. * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
  2016. * The value of @inlen after return is the number of octets consumed
  2017. * as the return value is positive, else unpredictable.
  2018. * The value of @outlen after return is the number of octets consumed.
  2019. */
  2020. int
  2021. UTF8ToHtml(unsigned char* out, int *outlen,
  2022. const unsigned char* in, int *inlen) {
  2023. const unsigned char* processed = in;
  2024. const unsigned char* outend;
  2025. const unsigned char* outstart = out;
  2026. const unsigned char* instart = in;
  2027. const unsigned char* inend;
  2028. unsigned int c, d;
  2029. int trailing;
  2030. if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
  2031. if (in == NULL) {
  2032. /*
  2033. * initialization nothing to do
  2034. */
  2035. *outlen = 0;
  2036. *inlen = 0;
  2037. return(0);
  2038. }
  2039. inend = in + (*inlen);
  2040. outend = out + (*outlen);
  2041. while (in < inend) {
  2042. d = *in++;
  2043. if (d < 0x80) { c= d; trailing= 0; }
  2044. else if (d < 0xC0) {
  2045. /* trailing byte in leading position */
  2046. *outlen = out - outstart;
  2047. *inlen = processed - instart;
  2048. return(-2);
  2049. } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
  2050. else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
  2051. else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
  2052. else {
  2053. /* no chance for this in Ascii */
  2054. *outlen = out - outstart;
  2055. *inlen = processed - instart;
  2056. return(-2);
  2057. }
  2058. if (inend - in < trailing) {
  2059. break;
  2060. }
  2061. for ( ; trailing; trailing--) {
  2062. if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
  2063. break;
  2064. c <<= 6;
  2065. c |= d & 0x3F;
  2066. }
  2067. /* assertion: c is a single UTF-4 value */
  2068. if (c < 0x80) {
  2069. if (out + 1 >= outend)
  2070. break;
  2071. *out++ = c;
  2072. } else {
  2073. int len;
  2074. const htmlEntityDesc * ent;
  2075. const char *cp;
  2076. char nbuf[16];
  2077. /*
  2078. * Try to lookup a predefined HTML entity for it
  2079. */
  2080. ent = htmlEntityValueLookup(c);
  2081. if (ent == NULL) {
  2082. snprintf(nbuf, sizeof(nbuf), "#%u", c);
  2083. cp = nbuf;
  2084. }
  2085. else
  2086. cp = ent->name;
  2087. len = strlen(cp);
  2088. if (out + 2 + len >= outend)
  2089. break;
  2090. *out++ = '&';
  2091. memcpy(out, cp, len);
  2092. out += len;
  2093. *out++ = ';';
  2094. }
  2095. processed = in;
  2096. }
  2097. *outlen = out - outstart;
  2098. *inlen = processed - instart;
  2099. return(0);
  2100. }
  2101. /**
  2102. * htmlEncodeEntities:
  2103. * @out: a pointer to an array of bytes to store the result
  2104. * @outlen: the length of @out
  2105. * @in: a pointer to an array of UTF-8 chars
  2106. * @inlen: the length of @in
  2107. * @quoteChar: the quote character to escape (' or ") or zero.
  2108. *
  2109. * Take a block of UTF-8 chars in and try to convert it to an ASCII
  2110. * plus HTML entities block of chars out.
  2111. *
  2112. * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
  2113. * The value of @inlen after return is the number of octets consumed
  2114. * as the return value is positive, else unpredictable.
  2115. * The value of @outlen after return is the number of octets consumed.
  2116. */
  2117. int
  2118. htmlEncodeEntities(unsigned char* out, int *outlen,
  2119. const unsigned char* in, int *inlen, int quoteChar) {
  2120. const unsigned char* processed = in;
  2121. const unsigned char* outend;
  2122. const unsigned char* outstart = out;
  2123. const unsigned char* instart = in;
  2124. const unsigned char* inend;
  2125. unsigned int c, d;
  2126. int trailing;
  2127. if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
  2128. return(-1);
  2129. outend = out + (*outlen);
  2130. inend = in + (*inlen);
  2131. while (in < inend) {
  2132. d = *in++;
  2133. if (d < 0x80) { c= d; trailing= 0; }
  2134. else if (d < 0xC0) {
  2135. /* trailing byte in leading position */
  2136. *outlen = out - outstart;
  2137. *inlen = processed - instart;
  2138. return(-2);
  2139. } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
  2140. else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
  2141. else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
  2142. else {
  2143. /* no chance for this in Ascii */
  2144. *outlen = out - outstart;
  2145. *inlen = processed - instart;
  2146. return(-2);
  2147. }
  2148. if (inend - in < trailing)
  2149. break;
  2150. while (trailing--) {
  2151. if (((d= *in++) & 0xC0) != 0x80) {
  2152. *outlen = out - outstart;
  2153. *inlen = processed - instart;
  2154. return(-2);
  2155. }
  2156. c <<= 6;
  2157. c |= d & 0x3F;
  2158. }
  2159. /* assertion: c is a single UTF-4 value */
  2160. if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
  2161. (c != '&') && (c != '<') && (c != '>')) {
  2162. if (out >= outend)
  2163. break;
  2164. *out++ = c;
  2165. } else {
  2166. const htmlEntityDesc * ent;
  2167. const char *cp;
  2168. char nbuf[16];
  2169. int len;
  2170. /*
  2171. * Try to lookup a predefined HTML entity for it
  2172. */
  2173. ent = htmlEntityValueLookup(c);
  2174. if (ent == NULL) {
  2175. snprintf(nbuf, sizeof(nbuf), "#%u", c);
  2176. cp = nbuf;
  2177. }
  2178. else
  2179. cp = ent->name;
  2180. len = strlen(cp);
  2181. if (out + 2 + len > outend)
  2182. break;
  2183. *out++ = '&';
  2184. memcpy(out, cp, len);
  2185. out += len;
  2186. *out++ = ';';
  2187. }
  2188. processed = in;
  2189. }
  2190. *outlen = out - outstart;
  2191. *inlen = processed - instart;
  2192. return(0);
  2193. }
  2194. /************************************************************************
  2195. * *
  2196. * Commodity functions to handle streams *
  2197. * *
  2198. ************************************************************************/
  2199. #ifdef LIBXML_PUSH_ENABLED
  2200. /**
  2201. * htmlNewInputStream:
  2202. * @ctxt: an HTML parser context
  2203. *
  2204. * Create a new input stream structure
  2205. * Returns the new input stream or NULL
  2206. */
  2207. static htmlParserInputPtr
  2208. htmlNewInputStream(htmlParserCtxtPtr ctxt) {
  2209. htmlParserInputPtr input;
  2210. input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
  2211. if (input == NULL) {
  2212. htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
  2213. return(NULL);
  2214. }
  2215. memset(input, 0, sizeof(htmlParserInput));
  2216. input->filename = NULL;
  2217. input->directory = NULL;
  2218. input->base = NULL;
  2219. input->cur = NULL;
  2220. input->buf = NULL;
  2221. input->line = 1;
  2222. input->col = 1;
  2223. input->buf = NULL;
  2224. input->free = NULL;
  2225. input->version = NULL;
  2226. input->consumed = 0;
  2227. input->length = 0;
  2228. return(input);
  2229. }
  2230. #endif
  2231. /************************************************************************
  2232. * *
  2233. * Commodity functions, cleanup needed ? *
  2234. * *
  2235. ************************************************************************/
  /*
   * All tags allowing pc data from the html 4.01 loose dtd.
   * NOTE: it might be more appropriate to integrate this information
   * into the html40ElementTable array but I don't want to risk any
   * binary incompatibility
   */
  static const char *allowPCData[] = {
      "a", "abbr", "acronym", "address", "applet",
      "b", "bdo", "big", "blockquote", "body", "button",
      "caption", "center", "cite", "code",
      "dd", "del", "dfn", "div", "dt",
      "em",
      "font", "form",
      "h1", "h2", "h3", "h4", "h5", "h6",
      "i", "iframe", "ins",
      "kbd",
      "label", "legend", "li",
      "noframes", "noscript",
      "object",
      "p", "pre",
      "q",
      "s", "samp", "small", "span", "strike", "strong",
      "td", "th", "tt",
      "u",
      "var"
  };
  2250. /**
  2251. * areBlanks:
  2252. * @ctxt: an HTML parser context
  2253. * @str: a xmlChar *
  2254. * @len: the size of @str
  2255. *
  2256. * Is this a sequence of blank chars that one can ignore ?
  2257. *
  2258. * Returns 1 if ignorable 0 otherwise.
  2259. */
  2260. static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
  2261. unsigned int i;
  2262. int j;
  2263. xmlNodePtr lastChild;
  2264. xmlDtdPtr dtd;
  2265. for (j = 0;j < len;j++)
  2266. if (!(IS_BLANK_CH(str[j]))) return(0);
  2267. if (CUR == 0) return(1);
  2268. if (CUR != '<') return(0);
  2269. if (ctxt->name == NULL)
  2270. return(1);
  2271. if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
  2272. return(1);
  2273. if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
  2274. return(1);
  2275. /* Only strip CDATA children of the body tag for strict HTML DTDs */
  2276. if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
  2277. dtd = xmlGetIntSubset(ctxt->myDoc);
  2278. if (dtd != NULL && dtd->ExternalID != NULL) {
  2279. if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
  2280. !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
  2281. return(1);
  2282. }
  2283. }
  2284. if (ctxt->node == NULL) return(0);
  2285. lastChild = xmlGetLastChild(ctxt->node);
  2286. while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
  2287. lastChild = lastChild->prev;
  2288. if (lastChild == NULL) {
  2289. if ((ctxt->node->type != XML_ELEMENT_NODE) &&
  2290. (ctxt->node->content != NULL)) return(0);
  2291. /* keep ws in constructs like ...<b> </b>...
  2292. for all tags "b" allowing PCDATA */
  2293. for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
  2294. if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
  2295. return(0);
  2296. }
  2297. }
  2298. } else if (xmlNodeIsText(lastChild)) {
  2299. return(0);
  2300. } else {
  2301. /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
  2302. for all tags "p" allowing PCDATA */
  2303. for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
  2304. if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
  2305. return(0);
  2306. }
  2307. }
  2308. }
  2309. return(1);
  2310. }
  2311. /**
  2312. * htmlNewDocNoDtD:
  2313. * @URI: URI for the dtd, or NULL
  2314. * @ExternalID: the external ID of the DTD, or NULL
  2315. *
  2316. * Creates a new HTML document without a DTD node if @URI and @ExternalID
  2317. * are NULL
  2318. *
  2319. * Returns a new document, do not initialize the DTD if not provided
  2320. */
  2321. htmlDocPtr
  2322. htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
  2323. xmlDocPtr cur;
  2324. /*
  2325. * Allocate a new document and fill the fields.
  2326. */
  2327. cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
  2328. if (cur == NULL) {
  2329. htmlErrMemory(NULL, "HTML document creation failed\n");
  2330. return(NULL);
  2331. }
  2332. memset(cur, 0, sizeof(xmlDoc));
  2333. cur->type = XML_HTML_DOCUMENT_NODE;
  2334. cur->version = NULL;
  2335. cur->intSubset = NULL;
  2336. cur->doc = cur;
  2337. cur->name = NULL;
  2338. cur->children = NULL;
  2339. cur->extSubset = NULL;
  2340. cur->oldNs = NULL;
  2341. cur->encoding = NULL;
  2342. cur->standalone = 1;
  2343. cur->compression = 0;
  2344. cur->ids = NULL;
  2345. cur->refs = NULL;
  2346. cur->_private = NULL;
  2347. cur->charset = XML_CHAR_ENCODING_UTF8;
  2348. cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
  2349. if ((ExternalID != NULL) ||
  2350. (URI != NULL))
  2351. xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
  2352. if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
  2353. xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
  2354. return(cur);
  2355. }
  2356. /**
  2357. * htmlNewDoc:
  2358. * @URI: URI for the dtd, or NULL
  2359. * @ExternalID: the external ID of the DTD, or NULL
  2360. *
  2361. * Creates a new HTML document
  2362. *
  2363. * Returns a new document
  2364. */
  2365. htmlDocPtr
  2366. htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
  2367. if ((URI == NULL) && (ExternalID == NULL))
  2368. return(htmlNewDocNoDtD(
  2369. BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
  2370. BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
  2371. return(htmlNewDocNoDtD(URI, ExternalID));
  2372. }
  2373. /************************************************************************
  2374. * *
  2375. * The parser itself *
  2376. * Relates to http://www.w3.org/TR/html40 *
  2377. * *
  2378. ************************************************************************/
  2379. /************************************************************************
  2380. * *
  2381. * The parser itself *
  2382. * *
  2383. ************************************************************************/
  2384. static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
  2385. /**
  2386. * htmlParseHTMLName:
  2387. * @ctxt: an HTML parser context
  2388. *
  2389. * parse an HTML tag or attribute name, note that we convert it to lowercase
  2390. * since HTML names are not case-sensitive.
  2391. *
  2392. * Returns the Tag Name parsed or NULL
  2393. */
  2394. static const xmlChar *
  2395. htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
  2396. int i = 0;
  2397. xmlChar loc[HTML_PARSER_BUFFER_SIZE];
  2398. if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
  2399. (CUR != ':') && (CUR != '.')) return(NULL);
  2400. while ((i < HTML_PARSER_BUFFER_SIZE) &&
  2401. ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
  2402. (CUR == ':') || (CUR == '-') || (CUR == '_') ||
  2403. (CUR == '.'))) {
  2404. if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
  2405. else loc[i] = CUR;
  2406. i++;
  2407. NEXT;
  2408. }
  2409. return(xmlDictLookup(ctxt->dict, loc, i));
  2410. }
  2411. /**
  2412. * htmlParseHTMLName_nonInvasive:
  2413. * @ctxt: an HTML parser context
  2414. *
  2415. * parse an HTML tag or attribute name, note that we convert it to lowercase
  2416. * since HTML names are not case-sensitive, this doesn't consume the data
  2417. * from the stream, it's a look-ahead
  2418. *
  2419. * Returns the Tag Name parsed or NULL
  2420. */
  2421. static const xmlChar *
  2422. htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
  2423. int i = 0;
  2424. xmlChar loc[HTML_PARSER_BUFFER_SIZE];
  2425. if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
  2426. (NXT(1) != ':')) return(NULL);
  2427. while ((i < HTML_PARSER_BUFFER_SIZE) &&
  2428. ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
  2429. (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
  2430. if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
  2431. else loc[i] = NXT(1+i);
  2432. i++;
  2433. }
  2434. return(xmlDictLookup(ctxt->dict, loc, i));
  2435. }
  2436. /**
  2437. * htmlParseName:
  2438. * @ctxt: an HTML parser context
  2439. *
  2440. * parse an HTML name, this routine is case sensitive.
  2441. *
  2442. * Returns the Name parsed or NULL
  2443. */
  2444. static const xmlChar *
  2445. htmlParseName(htmlParserCtxtPtr ctxt) {
  2446. const xmlChar *in;
  2447. const xmlChar *ret;
  2448. int count = 0;
  2449. GROW;
  2450. /*
  2451. * Accelerator for simple ASCII names
  2452. */
  2453. in = ctxt->input->cur;
  2454. if (((*in >= 0x61) && (*in <= 0x7A)) ||
  2455. ((*in >= 0x41) && (*in <= 0x5A)) ||
  2456. (*in == '_') || (*in == ':')) {
  2457. in++;
  2458. while (((*in >= 0x61) && (*in <= 0x7A)) ||
  2459. ((*in >= 0x41) && (*in <= 0x5A)) ||
  2460. ((*in >= 0x30) && (*in <= 0x39)) ||
  2461. (*in == '_') || (*in == '-') ||
  2462. (*in == ':') || (*in == '.'))
  2463. in++;
  2464. if (in == ctxt->input->end)
  2465. return(NULL);
  2466. if ((*in > 0) && (*in < 0x80)) {
  2467. count = in - ctxt->input->cur;
  2468. ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
  2469. ctxt->input->cur = in;
  2470. ctxt->input->col += count;
  2471. return(ret);
  2472. }
  2473. }
  2474. return(htmlParseNameComplex(ctxt));
  2475. }
  2476. static const xmlChar *
  2477. htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
  2478. int len = 0, l;
  2479. int c;
  2480. int count = 0;
  2481. const xmlChar *base = ctxt->input->base;
  2482. /*
  2483. * Handler for more complex cases
  2484. */
  2485. GROW;
  2486. c = CUR_CHAR(l);
  2487. if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
  2488. (!IS_LETTER(c) && (c != '_') &&
  2489. (c != ':'))) {
  2490. return(NULL);
  2491. }
  2492. while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
  2493. ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
  2494. (c == '.') || (c == '-') ||
  2495. (c == '_') || (c == ':') ||
  2496. (IS_COMBINING(c)) ||
  2497. (IS_EXTENDER(c)))) {
  2498. if (count++ > 100) {
  2499. count = 0;
  2500. GROW;
  2501. }
  2502. len += l;
  2503. NEXTL(l);
  2504. c = CUR_CHAR(l);
  2505. if (ctxt->input->base != base) {
  2506. /*
  2507. * We changed encoding from an unknown encoding
  2508. * Input buffer changed location, so we better start again
  2509. */
  2510. return(htmlParseNameComplex(ctxt));
  2511. }
  2512. }
  2513. if (ctxt->input->cur - ctxt->input->base < len) {
  2514. /* Sanity check */
  2515. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  2516. "unexpected change of input buffer", NULL, NULL);
  2517. return (NULL);
  2518. }
  2519. return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
  2520. }
  2521. /**
  2522. * htmlParseHTMLAttribute:
  2523. * @ctxt: an HTML parser context
  2524. * @stop: a char stop value
  2525. *
  2526. * parse an HTML attribute value till the stop (quote), if
  2527. * stop is 0 then it stops at the first space
  2528. *
  2529. * Returns the attribute parsed or NULL
  2530. */
  2531. static xmlChar *
  2532. htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
  2533. xmlChar *buffer = NULL;
  2534. int buffer_size = 0;
  2535. xmlChar *out = NULL;
  2536. const xmlChar *name = NULL;
  2537. const xmlChar *cur = NULL;
  2538. const htmlEntityDesc * ent;
  2539. /*
  2540. * allocate a translation buffer.
  2541. */
  2542. buffer_size = HTML_PARSER_BUFFER_SIZE;
  2543. buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
  2544. if (buffer == NULL) {
  2545. htmlErrMemory(ctxt, "buffer allocation failed\n");
  2546. return(NULL);
  2547. }
  2548. out = buffer;
  2549. /*
  2550. * Ok loop until we reach one of the ending chars
  2551. */
  2552. while ((CUR != 0) && (CUR != stop)) {
  2553. if ((stop == 0) && (CUR == '>')) break;
  2554. if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
  2555. if (CUR == '&') {
  2556. if (NXT(1) == '#') {
  2557. unsigned int c;
  2558. int bits;
  2559. c = htmlParseCharRef(ctxt);
  2560. if (c < 0x80)
  2561. { *out++ = c; bits= -6; }
  2562. else if (c < 0x800)
  2563. { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
  2564. else if (c < 0x10000)
  2565. { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
  2566. else
  2567. { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
  2568. for ( ; bits >= 0; bits-= 6) {
  2569. *out++ = ((c >> bits) & 0x3F) | 0x80;
  2570. }
  2571. if (out - buffer > buffer_size - 100) {
  2572. int indx = out - buffer;
  2573. growBuffer(buffer);
  2574. out = &buffer[indx];
  2575. }
  2576. } else {
  2577. ent = htmlParseEntityRef(ctxt, &name);
  2578. if (name == NULL) {
  2579. *out++ = '&';
  2580. if (out - buffer > buffer_size - 100) {
  2581. int indx = out - buffer;
  2582. growBuffer(buffer);
  2583. out = &buffer[indx];
  2584. }
  2585. } else if (ent == NULL) {
  2586. *out++ = '&';
  2587. cur = name;
  2588. while (*cur != 0) {
  2589. if (out - buffer > buffer_size - 100) {
  2590. int indx = out - buffer;
  2591. growBuffer(buffer);
  2592. out = &buffer[indx];
  2593. }
  2594. *out++ = *cur++;
  2595. }
  2596. } else {
  2597. unsigned int c;
  2598. int bits;
  2599. if (out - buffer > buffer_size - 100) {
  2600. int indx = out - buffer;
  2601. growBuffer(buffer);
  2602. out = &buffer[indx];
  2603. }
  2604. c = ent->value;
  2605. if (c < 0x80)
  2606. { *out++ = c; bits= -6; }
  2607. else if (c < 0x800)
  2608. { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
  2609. else if (c < 0x10000)
  2610. { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
  2611. else
  2612. { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
  2613. for ( ; bits >= 0; bits-= 6) {
  2614. *out++ = ((c >> bits) & 0x3F) | 0x80;
  2615. }
  2616. }
  2617. }
  2618. } else {
  2619. unsigned int c;
  2620. int bits, l;
  2621. if (out - buffer > buffer_size - 100) {
  2622. int indx = out - buffer;
  2623. growBuffer(buffer);
  2624. out = &buffer[indx];
  2625. }
  2626. c = CUR_CHAR(l);
  2627. if (c < 0x80)
  2628. { *out++ = c; bits= -6; }
  2629. else if (c < 0x800)
  2630. { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
  2631. else if (c < 0x10000)
  2632. { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
  2633. else
  2634. { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
  2635. for ( ; bits >= 0; bits-= 6) {
  2636. *out++ = ((c >> bits) & 0x3F) | 0x80;
  2637. }
  2638. NEXT;
  2639. }
  2640. }
  2641. *out = 0;
  2642. return(buffer);
  2643. }
  2644. /**
  2645. * htmlParseEntityRef:
  2646. * @ctxt: an HTML parser context
  2647. * @str: location to store the entity name
  2648. *
  2649. * parse an HTML ENTITY references
  2650. *
  2651. * [68] EntityRef ::= '&' Name ';'
  2652. *
  2653. * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
  2654. * if non-NULL *str will have to be freed by the caller.
  2655. */
  2656. const htmlEntityDesc *
  2657. htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
  2658. const xmlChar *name;
  2659. const htmlEntityDesc * ent = NULL;
  2660. if (str != NULL) *str = NULL;
  2661. if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
  2662. if (CUR == '&') {
  2663. NEXT;
  2664. name = htmlParseName(ctxt);
  2665. if (name == NULL) {
  2666. htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
  2667. "htmlParseEntityRef: no name\n", NULL, NULL);
  2668. } else {
  2669. GROW;
  2670. if (CUR == ';') {
  2671. if (str != NULL)
  2672. *str = name;
  2673. /*
  2674. * Lookup the entity in the table.
  2675. */
  2676. ent = htmlEntityLookup(name);
  2677. if (ent != NULL) /* OK that's ugly !!! */
  2678. NEXT;
  2679. } else {
  2680. htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
  2681. "htmlParseEntityRef: expecting ';'\n",
  2682. NULL, NULL);
  2683. if (str != NULL)
  2684. *str = name;
  2685. }
  2686. }
  2687. }
  2688. return(ent);
  2689. }
  2690. /**
  2691. * htmlParseAttValue:
  2692. * @ctxt: an HTML parser context
  2693. *
  2694. * parse a value for an attribute
  2695. * Note: the parser won't do substitution of entities here, this
  2696. * will be handled later in xmlStringGetNodeList, unless it was
  2697. * asked for ctxt->replaceEntities != 0
  2698. *
  2699. * Returns the AttValue parsed or NULL.
  2700. */
  2701. static xmlChar *
  2702. htmlParseAttValue(htmlParserCtxtPtr ctxt) {
  2703. xmlChar *ret = NULL;
  2704. if (CUR == '"') {
  2705. NEXT;
  2706. ret = htmlParseHTMLAttribute(ctxt, '"');
  2707. if (CUR != '"') {
  2708. htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
  2709. "AttValue: \" expected\n", NULL, NULL);
  2710. } else
  2711. NEXT;
  2712. } else if (CUR == '\'') {
  2713. NEXT;
  2714. ret = htmlParseHTMLAttribute(ctxt, '\'');
  2715. if (CUR != '\'') {
  2716. htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
  2717. "AttValue: ' expected\n", NULL, NULL);
  2718. } else
  2719. NEXT;
  2720. } else {
  2721. /*
  2722. * That's an HTMLism, the attribute value may not be quoted
  2723. */
  2724. ret = htmlParseHTMLAttribute(ctxt, 0);
  2725. if (ret == NULL) {
  2726. htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
  2727. "AttValue: no value found\n", NULL, NULL);
  2728. }
  2729. }
  2730. return(ret);
  2731. }
  2732. /**
  2733. * htmlParseSystemLiteral:
  2734. * @ctxt: an HTML parser context
  2735. *
  2736. * parse an HTML Literal
  2737. *
  2738. * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
  2739. *
  2740. * Returns the SystemLiteral parsed or NULL
  2741. */
  2742. static xmlChar *
  2743. htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
  2744. size_t len = 0, startPosition = 0;
  2745. int err = 0;
  2746. int quote;
  2747. xmlChar *ret = NULL;
  2748. if ((CUR != '"') && (CUR != '\'')) {
  2749. htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
  2750. "SystemLiteral \" or ' expected\n", NULL, NULL);
  2751. return(NULL);
  2752. }
  2753. quote = CUR;
  2754. NEXT;
  2755. if (CUR_PTR < BASE_PTR)
  2756. return(ret);
  2757. startPosition = CUR_PTR - BASE_PTR;
  2758. while ((CUR != 0) && (CUR != quote)) {
  2759. /* TODO: Handle UTF-8 */
  2760. if (!IS_CHAR_CH(CUR)) {
  2761. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  2762. "Invalid char in SystemLiteral 0x%X\n", CUR);
  2763. err = 1;
  2764. }
  2765. NEXT;
  2766. len++;
  2767. }
  2768. if (CUR != quote) {
  2769. htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
  2770. "Unfinished SystemLiteral\n", NULL, NULL);
  2771. } else {
  2772. NEXT;
  2773. if (err == 0)
  2774. ret = xmlStrndup((BASE_PTR+startPosition), len);
  2775. }
  2776. return(ret);
  2777. }
  2778. /**
  2779. * htmlParsePubidLiteral:
  2780. * @ctxt: an HTML parser context
  2781. *
  2782. * parse an HTML public literal
  2783. *
  2784. * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
  2785. *
  2786. * Returns the PubidLiteral parsed or NULL.
  2787. */
  2788. static xmlChar *
  2789. htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
  2790. size_t len = 0, startPosition = 0;
  2791. int err = 0;
  2792. int quote;
  2793. xmlChar *ret = NULL;
  2794. if ((CUR != '"') && (CUR != '\'')) {
  2795. htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
  2796. "PubidLiteral \" or ' expected\n", NULL, NULL);
  2797. return(NULL);
  2798. }
  2799. quote = CUR;
  2800. NEXT;
  2801. /*
  2802. * Name ::= (Letter | '_') (NameChar)*
  2803. */
  2804. if (CUR_PTR < BASE_PTR)
  2805. return(ret);
  2806. startPosition = CUR_PTR - BASE_PTR;
  2807. while ((CUR != 0) && (CUR != quote)) {
  2808. if (!IS_PUBIDCHAR_CH(CUR)) {
  2809. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  2810. "Invalid char in PubidLiteral 0x%X\n", CUR);
  2811. err = 1;
  2812. }
  2813. len++;
  2814. NEXT;
  2815. }
  2816. if (CUR != quote) {
  2817. htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
  2818. "Unfinished PubidLiteral\n", NULL, NULL);
  2819. } else {
  2820. NEXT;
  2821. if (err == 0)
  2822. ret = xmlStrndup((BASE_PTR + startPosition), len);
  2823. }
  2824. return(ret);
  2825. }
  2826. /**
  2827. * htmlParseScript:
  2828. * @ctxt: an HTML parser context
  2829. *
  2830. * parse the content of an HTML SCRIPT or STYLE element
  2831. * http://www.w3.org/TR/html4/sgml/dtd.html#Script
  2832. * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
  2833. * http://www.w3.org/TR/html4/types.html#type-script
  2834. * http://www.w3.org/TR/html4/types.html#h-6.15
  2835. * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
  2836. *
  2837. * Script data ( %Script; in the DTD) can be the content of the SCRIPT
  2838. * element and the value of intrinsic event attributes. User agents must
  2839. * not evaluate script data as HTML markup but instead must pass it on as
  2840. * data to a script engine.
  2841. * NOTES:
  2842. * - The content is passed like CDATA
  2843. * - the attributes for style and scripting "onXXX" are also described
  2844. * as CDATA but SGML allows entities references in attributes so their
  2845. * processing is identical as other attributes
  2846. */
  2847. static void
  2848. htmlParseScript(htmlParserCtxtPtr ctxt) {
  2849. xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
  2850. int nbchar = 0;
  2851. int cur,l;
  2852. SHRINK;
  2853. cur = CUR_CHAR(l);
  2854. while (cur != 0) {
  2855. if ((cur == '<') && (NXT(1) == '/')) {
  2856. /*
  2857. * One should break here, the specification is clear:
  2858. * Authors should therefore escape "</" within the content.
  2859. * Escape mechanisms are specific to each scripting or
  2860. * style sheet language.
  2861. *
  2862. * In recovery mode, only break if end tag match the
  2863. * current tag, effectively ignoring all tags inside the
  2864. * script/style block and treating the entire block as
  2865. * CDATA.
  2866. */
  2867. if (ctxt->recovery) {
  2868. if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
  2869. xmlStrlen(ctxt->name)) == 0)
  2870. {
  2871. break; /* while */
  2872. } else {
  2873. htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
  2874. "Element %s embeds close tag\n",
  2875. ctxt->name, NULL);
  2876. }
  2877. } else {
  2878. if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
  2879. ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
  2880. {
  2881. break; /* while */
  2882. }
  2883. }
  2884. }
  2885. if (IS_CHAR(cur)) {
  2886. COPY_BUF(l,buf,nbchar,cur);
  2887. } else {
  2888. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  2889. "Invalid char in CDATA 0x%X\n", cur);
  2890. }
  2891. if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
  2892. buf[nbchar] = 0;
  2893. if (ctxt->sax->cdataBlock!= NULL) {
  2894. /*
  2895. * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
  2896. */
  2897. ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
  2898. } else if (ctxt->sax->characters != NULL) {
  2899. ctxt->sax->characters(ctxt->userData, buf, nbchar);
  2900. }
  2901. nbchar = 0;
  2902. }
  2903. GROW;
  2904. NEXTL(l);
  2905. cur = CUR_CHAR(l);
  2906. }
  2907. if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
  2908. buf[nbchar] = 0;
  2909. if (ctxt->sax->cdataBlock!= NULL) {
  2910. /*
  2911. * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
  2912. */
  2913. ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
  2914. } else if (ctxt->sax->characters != NULL) {
  2915. ctxt->sax->characters(ctxt->userData, buf, nbchar);
  2916. }
  2917. }
  2918. }
  2919. /**
  2920. * htmlParseCharDataInternal:
  2921. * @ctxt: an HTML parser context
  2922. * @readahead: optional read ahead character in ascii range
  2923. *
  2924. * parse a CharData section.
  2925. * if we are within a CDATA section ']]>' marks an end of section.
  2926. *
  2927. * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
  2928. */
  2929. static void
  2930. htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
  2931. xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
  2932. int nbchar = 0;
  2933. int cur, l;
  2934. int chunk = 0;
  2935. if (readahead)
  2936. buf[nbchar++] = readahead;
  2937. SHRINK;
  2938. cur = CUR_CHAR(l);
  2939. while (((cur != '<') || (ctxt->token == '<')) &&
  2940. ((cur != '&') || (ctxt->token == '&')) &&
  2941. (cur != 0)) {
  2942. if (!(IS_CHAR(cur))) {
  2943. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  2944. "Invalid char in CDATA 0x%X\n", cur);
  2945. } else {
  2946. COPY_BUF(l,buf,nbchar,cur);
  2947. }
  2948. if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
  2949. buf[nbchar] = 0;
  2950. /*
  2951. * Ok the segment is to be consumed as chars.
  2952. */
  2953. if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
  2954. if (areBlanks(ctxt, buf, nbchar)) {
  2955. if (ctxt->keepBlanks) {
  2956. if (ctxt->sax->characters != NULL)
  2957. ctxt->sax->characters(ctxt->userData, buf, nbchar);
  2958. } else {
  2959. if (ctxt->sax->ignorableWhitespace != NULL)
  2960. ctxt->sax->ignorableWhitespace(ctxt->userData,
  2961. buf, nbchar);
  2962. }
  2963. } else {
  2964. htmlCheckParagraph(ctxt);
  2965. if (ctxt->sax->characters != NULL)
  2966. ctxt->sax->characters(ctxt->userData, buf, nbchar);
  2967. }
  2968. }
  2969. nbchar = 0;
  2970. }
  2971. NEXTL(l);
  2972. chunk++;
  2973. if (chunk > HTML_PARSER_BUFFER_SIZE) {
  2974. chunk = 0;
  2975. SHRINK;
  2976. GROW;
  2977. }
  2978. cur = CUR_CHAR(l);
  2979. if (cur == 0) {
  2980. SHRINK;
  2981. GROW;
  2982. cur = CUR_CHAR(l);
  2983. }
  2984. }
  2985. if (nbchar != 0) {
  2986. buf[nbchar] = 0;
  2987. /*
  2988. * Ok the segment is to be consumed as chars.
  2989. */
  2990. if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
  2991. if (areBlanks(ctxt, buf, nbchar)) {
  2992. if (ctxt->keepBlanks) {
  2993. if (ctxt->sax->characters != NULL)
  2994. ctxt->sax->characters(ctxt->userData, buf, nbchar);
  2995. } else {
  2996. if (ctxt->sax->ignorableWhitespace != NULL)
  2997. ctxt->sax->ignorableWhitespace(ctxt->userData,
  2998. buf, nbchar);
  2999. }
  3000. } else {
  3001. htmlCheckParagraph(ctxt);
  3002. if (ctxt->sax->characters != NULL)
  3003. ctxt->sax->characters(ctxt->userData, buf, nbchar);
  3004. }
  3005. }
  3006. } else {
  3007. /*
  3008. * Loop detection
  3009. */
  3010. if (cur == 0)
  3011. ctxt->instate = XML_PARSER_EOF;
  3012. }
  3013. }
  3014. /**
  3015. * htmlParseCharData:
  3016. * @ctxt: an HTML parser context
  3017. *
  3018. * parse a CharData section.
  3019. * if we are within a CDATA section ']]>' marks an end of section.
  3020. *
  3021. * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
  3022. */
  3023. static void
  3024. htmlParseCharData(htmlParserCtxtPtr ctxt) {
  3025. htmlParseCharDataInternal(ctxt, 0);
  3026. }
  3027. /**
  3028. * htmlParseExternalID:
  3029. * @ctxt: an HTML parser context
  3030. * @publicID: a xmlChar** receiving PubidLiteral
  3031. *
  3032. * Parse an External ID or a Public ID
  3033. *
  3034. * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
  3035. * | 'PUBLIC' S PubidLiteral S SystemLiteral
  3036. *
  3037. * [83] PublicID ::= 'PUBLIC' S PubidLiteral
  3038. *
  3039. * Returns the function returns SystemLiteral and in the second
  3040. * case publicID receives PubidLiteral, is strict is off
  3041. * it is possible to return NULL and have publicID set.
  3042. */
  3043. static xmlChar *
  3044. htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
  3045. xmlChar *URI = NULL;
  3046. if ((UPPER == 'S') && (UPP(1) == 'Y') &&
  3047. (UPP(2) == 'S') && (UPP(3) == 'T') &&
  3048. (UPP(4) == 'E') && (UPP(5) == 'M')) {
  3049. SKIP(6);
  3050. if (!IS_BLANK_CH(CUR)) {
  3051. htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
  3052. "Space required after 'SYSTEM'\n", NULL, NULL);
  3053. }
  3054. SKIP_BLANKS;
  3055. URI = htmlParseSystemLiteral(ctxt);
  3056. if (URI == NULL) {
  3057. htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
  3058. "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
  3059. }
  3060. } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
  3061. (UPP(2) == 'B') && (UPP(3) == 'L') &&
  3062. (UPP(4) == 'I') && (UPP(5) == 'C')) {
  3063. SKIP(6);
  3064. if (!IS_BLANK_CH(CUR)) {
  3065. htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
  3066. "Space required after 'PUBLIC'\n", NULL, NULL);
  3067. }
  3068. SKIP_BLANKS;
  3069. *publicID = htmlParsePubidLiteral(ctxt);
  3070. if (*publicID == NULL) {
  3071. htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
  3072. "htmlParseExternalID: PUBLIC, no Public Identifier\n",
  3073. NULL, NULL);
  3074. }
  3075. SKIP_BLANKS;
  3076. if ((CUR == '"') || (CUR == '\'')) {
  3077. URI = htmlParseSystemLiteral(ctxt);
  3078. }
  3079. }
  3080. return(URI);
  3081. }
  3082. /**
  3083. * xmlParsePI:
  3084. * @ctxt: an XML parser context
  3085. *
  3086. * parse an XML Processing Instruction.
  3087. *
  3088. * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
  3089. */
  3090. static void
  3091. htmlParsePI(htmlParserCtxtPtr ctxt) {
  3092. xmlChar *buf = NULL;
  3093. int len = 0;
  3094. int size = HTML_PARSER_BUFFER_SIZE;
  3095. int cur, l;
  3096. const xmlChar *target;
  3097. xmlParserInputState state;
  3098. int count = 0;
  3099. if ((RAW == '<') && (NXT(1) == '?')) {
  3100. state = ctxt->instate;
  3101. ctxt->instate = XML_PARSER_PI;
  3102. /*
  3103. * this is a Processing Instruction.
  3104. */
  3105. SKIP(2);
  3106. SHRINK;
  3107. /*
  3108. * Parse the target name and check for special support like
  3109. * namespace.
  3110. */
  3111. target = htmlParseName(ctxt);
  3112. if (target != NULL) {
  3113. if (RAW == '>') {
  3114. SKIP(1);
  3115. /*
  3116. * SAX: PI detected.
  3117. */
  3118. if ((ctxt->sax) && (!ctxt->disableSAX) &&
  3119. (ctxt->sax->processingInstruction != NULL))
  3120. ctxt->sax->processingInstruction(ctxt->userData,
  3121. target, NULL);
  3122. ctxt->instate = state;
  3123. return;
  3124. }
  3125. buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
  3126. if (buf == NULL) {
  3127. htmlErrMemory(ctxt, NULL);
  3128. ctxt->instate = state;
  3129. return;
  3130. }
  3131. cur = CUR;
  3132. if (!IS_BLANK(cur)) {
  3133. htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
  3134. "ParsePI: PI %s space expected\n", target, NULL);
  3135. }
  3136. SKIP_BLANKS;
  3137. cur = CUR_CHAR(l);
  3138. while ((cur != 0) && (cur != '>')) {
  3139. if (len + 5 >= size) {
  3140. xmlChar *tmp;
  3141. size *= 2;
  3142. tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
  3143. if (tmp == NULL) {
  3144. htmlErrMemory(ctxt, NULL);
  3145. xmlFree(buf);
  3146. ctxt->instate = state;
  3147. return;
  3148. }
  3149. buf = tmp;
  3150. }
  3151. count++;
  3152. if (count > 50) {
  3153. GROW;
  3154. count = 0;
  3155. }
  3156. if (IS_CHAR(cur)) {
  3157. COPY_BUF(l,buf,len,cur);
  3158. } else {
  3159. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  3160. "Invalid char in processing instruction "
  3161. "0x%X\n", cur);
  3162. }
  3163. NEXTL(l);
  3164. cur = CUR_CHAR(l);
  3165. if (cur == 0) {
  3166. SHRINK;
  3167. GROW;
  3168. cur = CUR_CHAR(l);
  3169. }
  3170. }
  3171. buf[len] = 0;
  3172. if (cur != '>') {
  3173. htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
  3174. "ParsePI: PI %s never end ...\n", target, NULL);
  3175. } else {
  3176. SKIP(1);
  3177. /*
  3178. * SAX: PI detected.
  3179. */
  3180. if ((ctxt->sax) && (!ctxt->disableSAX) &&
  3181. (ctxt->sax->processingInstruction != NULL))
  3182. ctxt->sax->processingInstruction(ctxt->userData,
  3183. target, buf);
  3184. }
  3185. xmlFree(buf);
  3186. } else {
  3187. htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
  3188. "PI is not started correctly", NULL, NULL);
  3189. }
  3190. ctxt->instate = state;
  3191. }
  3192. }
  3193. /**
  3194. * htmlParseComment:
  3195. * @ctxt: an HTML parser context
  3196. *
  3197. * Parse an XML (SGML) comment <!-- .... -->
  3198. *
  3199. * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
  3200. */
  3201. static void
  3202. htmlParseComment(htmlParserCtxtPtr ctxt) {
  3203. xmlChar *buf = NULL;
  3204. int len;
  3205. int size = HTML_PARSER_BUFFER_SIZE;
  3206. int q, ql;
  3207. int r, rl;
  3208. int cur, l;
  3209. int next, nl;
  3210. xmlParserInputState state;
  3211. /*
  3212. * Check that there is a comment right here.
  3213. */
  3214. if ((RAW != '<') || (NXT(1) != '!') ||
  3215. (NXT(2) != '-') || (NXT(3) != '-')) return;
  3216. state = ctxt->instate;
  3217. ctxt->instate = XML_PARSER_COMMENT;
  3218. SHRINK;
  3219. SKIP(4);
  3220. buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
  3221. if (buf == NULL) {
  3222. htmlErrMemory(ctxt, "buffer allocation failed\n");
  3223. ctxt->instate = state;
  3224. return;
  3225. }
  3226. len = 0;
  3227. buf[len] = 0;
  3228. q = CUR_CHAR(ql);
  3229. if (q == 0)
  3230. goto unfinished;
  3231. NEXTL(ql);
  3232. r = CUR_CHAR(rl);
  3233. if (r == 0)
  3234. goto unfinished;
  3235. NEXTL(rl);
  3236. cur = CUR_CHAR(l);
  3237. while ((cur != 0) &&
  3238. ((cur != '>') ||
  3239. (r != '-') || (q != '-'))) {
  3240. NEXTL(l);
  3241. next = CUR_CHAR(nl);
  3242. if (next == 0) {
  3243. SHRINK;
  3244. GROW;
  3245. next = CUR_CHAR(nl);
  3246. }
  3247. if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
  3248. htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
  3249. "Comment incorrectly closed by '--!>'", NULL, NULL);
  3250. cur = '>';
  3251. break;
  3252. }
  3253. if (len + 5 >= size) {
  3254. xmlChar *tmp;
  3255. size *= 2;
  3256. tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
  3257. if (tmp == NULL) {
  3258. xmlFree(buf);
  3259. htmlErrMemory(ctxt, "growing buffer failed\n");
  3260. ctxt->instate = state;
  3261. return;
  3262. }
  3263. buf = tmp;
  3264. }
  3265. if (IS_CHAR(q)) {
  3266. COPY_BUF(ql,buf,len,q);
  3267. } else {
  3268. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  3269. "Invalid char in comment 0x%X\n", q);
  3270. }
  3271. q = r;
  3272. ql = rl;
  3273. r = cur;
  3274. rl = l;
  3275. cur = next;
  3276. l = nl;
  3277. }
  3278. buf[len] = 0;
  3279. if (cur == '>') {
  3280. NEXT;
  3281. if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
  3282. (!ctxt->disableSAX))
  3283. ctxt->sax->comment(ctxt->userData, buf);
  3284. xmlFree(buf);
  3285. ctxt->instate = state;
  3286. return;
  3287. }
  3288. unfinished:
  3289. htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
  3290. "Comment not terminated \n<!--%.50s\n", buf, NULL);
  3291. xmlFree(buf);
  3292. }
  3293. /**
  3294. * htmlParseCharRef:
  3295. * @ctxt: an HTML parser context
  3296. *
  3297. * parse Reference declarations
  3298. *
  3299. * [66] CharRef ::= '&#' [0-9]+ ';' |
  3300. * '&#x' [0-9a-fA-F]+ ';'
  3301. *
  3302. * Returns the value parsed (as an int)
  3303. */
  3304. int
  3305. htmlParseCharRef(htmlParserCtxtPtr ctxt) {
  3306. int val = 0;
  3307. if ((ctxt == NULL) || (ctxt->input == NULL)) {
  3308. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  3309. "htmlParseCharRef: context error\n",
  3310. NULL, NULL);
  3311. return(0);
  3312. }
  3313. if ((CUR == '&') && (NXT(1) == '#') &&
  3314. ((NXT(2) == 'x') || NXT(2) == 'X')) {
  3315. SKIP(3);
  3316. while (CUR != ';') {
  3317. if ((CUR >= '0') && (CUR <= '9')) {
  3318. if (val < 0x110000)
  3319. val = val * 16 + (CUR - '0');
  3320. } else if ((CUR >= 'a') && (CUR <= 'f')) {
  3321. if (val < 0x110000)
  3322. val = val * 16 + (CUR - 'a') + 10;
  3323. } else if ((CUR >= 'A') && (CUR <= 'F')) {
  3324. if (val < 0x110000)
  3325. val = val * 16 + (CUR - 'A') + 10;
  3326. } else {
  3327. htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
  3328. "htmlParseCharRef: missing semicolon\n",
  3329. NULL, NULL);
  3330. break;
  3331. }
  3332. NEXT;
  3333. }
  3334. if (CUR == ';')
  3335. NEXT;
  3336. } else if ((CUR == '&') && (NXT(1) == '#')) {
  3337. SKIP(2);
  3338. while (CUR != ';') {
  3339. if ((CUR >= '0') && (CUR <= '9')) {
  3340. if (val < 0x110000)
  3341. val = val * 10 + (CUR - '0');
  3342. } else {
  3343. htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
  3344. "htmlParseCharRef: missing semicolon\n",
  3345. NULL, NULL);
  3346. break;
  3347. }
  3348. NEXT;
  3349. }
  3350. if (CUR == ';')
  3351. NEXT;
  3352. } else {
  3353. htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
  3354. "htmlParseCharRef: invalid value\n", NULL, NULL);
  3355. }
  3356. /*
  3357. * Check the value IS_CHAR ...
  3358. */
  3359. if (IS_CHAR(val)) {
  3360. return(val);
  3361. } else if (val >= 0x110000) {
  3362. htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
  3363. "htmlParseCharRef: value too large\n", NULL, NULL);
  3364. } else {
  3365. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  3366. "htmlParseCharRef: invalid xmlChar value %d\n",
  3367. val);
  3368. }
  3369. return(0);
  3370. }
  3371. /**
  3372. * htmlParseDocTypeDecl:
  3373. * @ctxt: an HTML parser context
  3374. *
  3375. * parse a DOCTYPE declaration
  3376. *
  3377. * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
  3378. * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
  3379. */
  3380. static void
  3381. htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
  3382. const xmlChar *name;
  3383. xmlChar *ExternalID = NULL;
  3384. xmlChar *URI = NULL;
  3385. /*
  3386. * We know that '<!DOCTYPE' has been detected.
  3387. */
  3388. SKIP(9);
  3389. SKIP_BLANKS;
  3390. /*
  3391. * Parse the DOCTYPE name.
  3392. */
  3393. name = htmlParseName(ctxt);
  3394. if (name == NULL) {
  3395. htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
  3396. "htmlParseDocTypeDecl : no DOCTYPE name !\n",
  3397. NULL, NULL);
  3398. }
  3399. /*
  3400. * Check that upper(name) == "HTML" !!!!!!!!!!!!!
  3401. */
  3402. SKIP_BLANKS;
  3403. /*
  3404. * Check for SystemID and ExternalID
  3405. */
  3406. URI = htmlParseExternalID(ctxt, &ExternalID);
  3407. SKIP_BLANKS;
  3408. /*
  3409. * We should be at the end of the DOCTYPE declaration.
  3410. */
  3411. if (CUR != '>') {
  3412. htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
  3413. "DOCTYPE improperly terminated\n", NULL, NULL);
  3414. /* Ignore bogus content */
  3415. while ((CUR != 0) && (CUR != '>'))
  3416. NEXT;
  3417. }
  3418. if (CUR == '>')
  3419. NEXT;
  3420. /*
  3421. * Create or update the document accordingly to the DOCTYPE
  3422. */
  3423. if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
  3424. (!ctxt->disableSAX))
  3425. ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
  3426. /*
  3427. * Cleanup, since we don't use all those identifiers
  3428. */
  3429. if (URI != NULL) xmlFree(URI);
  3430. if (ExternalID != NULL) xmlFree(ExternalID);
  3431. }
  3432. /**
  3433. * htmlParseAttribute:
  3434. * @ctxt: an HTML parser context
  3435. * @value: a xmlChar ** used to store the value of the attribute
  3436. *
  3437. * parse an attribute
  3438. *
  3439. * [41] Attribute ::= Name Eq AttValue
  3440. *
  3441. * [25] Eq ::= S? '=' S?
  3442. *
  3443. * With namespace:
  3444. *
  3445. * [NS 11] Attribute ::= QName Eq AttValue
  3446. *
  3447. * Also the case QName == xmlns:??? is handled independently as a namespace
  3448. * definition.
  3449. *
  3450. * Returns the attribute name, and the value in *value.
  3451. */
  3452. static const xmlChar *
  3453. htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
  3454. const xmlChar *name;
  3455. xmlChar *val = NULL;
  3456. *value = NULL;
  3457. name = htmlParseHTMLName(ctxt);
  3458. if (name == NULL) {
  3459. htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
  3460. "error parsing attribute name\n", NULL, NULL);
  3461. return(NULL);
  3462. }
  3463. /*
  3464. * read the value
  3465. */
  3466. SKIP_BLANKS;
  3467. if (CUR == '=') {
  3468. NEXT;
  3469. SKIP_BLANKS;
  3470. val = htmlParseAttValue(ctxt);
  3471. }
  3472. *value = val;
  3473. return(name);
  3474. }
  3475. /**
  3476. * htmlCheckEncodingDirect:
  3477. * @ctxt: an HTML parser context
  3478. * @attvalue: the attribute value
  3479. *
  3480. * Checks an attribute value to detect
  3481. * the encoding
  3482. * If a new encoding is detected the parser is switched to decode
  3483. * it and pass UTF8
  3484. */
  3485. static void
  3486. htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
  3487. if ((ctxt == NULL) || (encoding == NULL) ||
  3488. (ctxt->options & HTML_PARSE_IGNORE_ENC))
  3489. return;
  3490. /* do not change encoding */
  3491. if (ctxt->input->encoding != NULL)
  3492. return;
  3493. if (encoding != NULL) {
  3494. xmlCharEncoding enc;
  3495. xmlCharEncodingHandlerPtr handler;
  3496. while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
  3497. if (ctxt->input->encoding != NULL)
  3498. xmlFree((xmlChar *) ctxt->input->encoding);
  3499. ctxt->input->encoding = xmlStrdup(encoding);
  3500. enc = xmlParseCharEncoding((const char *) encoding);
  3501. /*
  3502. * registered set of known encodings
  3503. */
  3504. if (enc != XML_CHAR_ENCODING_ERROR) {
  3505. if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
  3506. (enc == XML_CHAR_ENCODING_UTF16BE) ||
  3507. (enc == XML_CHAR_ENCODING_UCS4LE) ||
  3508. (enc == XML_CHAR_ENCODING_UCS4BE)) &&
  3509. (ctxt->input->buf != NULL) &&
  3510. (ctxt->input->buf->encoder == NULL)) {
  3511. htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
  3512. "htmlCheckEncoding: wrong encoding meta\n",
  3513. NULL, NULL);
  3514. } else {
  3515. xmlSwitchEncoding(ctxt, enc);
  3516. }
  3517. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  3518. } else {
  3519. /*
  3520. * fallback for unknown encodings
  3521. */
  3522. handler = xmlFindCharEncodingHandler((const char *) encoding);
  3523. if (handler != NULL) {
  3524. xmlSwitchToEncoding(ctxt, handler);
  3525. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  3526. } else {
  3527. htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  3528. "htmlCheckEncoding: unknown encoding %s\n",
  3529. encoding, NULL);
  3530. }
  3531. }
  3532. if ((ctxt->input->buf != NULL) &&
  3533. (ctxt->input->buf->encoder != NULL) &&
  3534. (ctxt->input->buf->raw != NULL) &&
  3535. (ctxt->input->buf->buffer != NULL)) {
  3536. int nbchars;
  3537. int processed;
  3538. /*
  3539. * convert as much as possible to the parser reading buffer.
  3540. */
  3541. processed = ctxt->input->cur - ctxt->input->base;
  3542. xmlBufShrink(ctxt->input->buf->buffer, processed);
  3543. nbchars = xmlCharEncInput(ctxt->input->buf, 1);
  3544. xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
  3545. if (nbchars < 0) {
  3546. htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
  3547. "htmlCheckEncoding: encoder error\n",
  3548. NULL, NULL);
  3549. }
  3550. }
  3551. }
  3552. }
  3553. /**
  3554. * htmlCheckEncoding:
  3555. * @ctxt: an HTML parser context
  3556. * @attvalue: the attribute value
  3557. *
  3558. * Checks an http-equiv attribute from a Meta tag to detect
  3559. * the encoding
  3560. * If a new encoding is detected the parser is switched to decode
  3561. * it and pass UTF8
  3562. */
  3563. static void
  3564. htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
  3565. const xmlChar *encoding;
  3566. if (!attvalue)
  3567. return;
  3568. encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
  3569. if (encoding != NULL) {
  3570. encoding += 7;
  3571. }
  3572. /*
  3573. * skip blank
  3574. */
  3575. if (encoding && IS_BLANK_CH(*encoding))
  3576. encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
  3577. if (encoding && *encoding == '=') {
  3578. encoding ++;
  3579. htmlCheckEncodingDirect(ctxt, encoding);
  3580. }
  3581. }
  3582. /**
  3583. * htmlCheckMeta:
  3584. * @ctxt: an HTML parser context
  3585. * @atts: the attributes values
  3586. *
  3587. * Checks an attributes from a Meta tag
  3588. */
  3589. static void
  3590. htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
  3591. int i;
  3592. const xmlChar *att, *value;
  3593. int http = 0;
  3594. const xmlChar *content = NULL;
  3595. if ((ctxt == NULL) || (atts == NULL))
  3596. return;
  3597. i = 0;
  3598. att = atts[i++];
  3599. while (att != NULL) {
  3600. value = atts[i++];
  3601. if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
  3602. && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
  3603. http = 1;
  3604. else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
  3605. htmlCheckEncodingDirect(ctxt, value);
  3606. else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
  3607. content = value;
  3608. att = atts[i++];
  3609. }
  3610. if ((http) && (content != NULL))
  3611. htmlCheckEncoding(ctxt, content);
  3612. }
  3613. /**
  3614. * htmlParseStartTag:
  3615. * @ctxt: an HTML parser context
  3616. *
  3617. * parse a start of tag either for rule element or
  3618. * EmptyElement. In both case we don't parse the tag closing chars.
  3619. *
  3620. * [40] STag ::= '<' Name (S Attribute)* S? '>'
  3621. *
  3622. * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
  3623. *
  3624. * With namespace:
  3625. *
  3626. * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
  3627. *
  3628. * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
  3629. *
  3630. * Returns 0 in case of success, -1 in case of error and 1 if discarded
  3631. */
  3632. static int
  3633. htmlParseStartTag(htmlParserCtxtPtr ctxt) {
  3634. const xmlChar *name;
  3635. const xmlChar *attname;
  3636. xmlChar *attvalue;
  3637. const xmlChar **atts;
  3638. int nbatts = 0;
  3639. int maxatts;
  3640. int meta = 0;
  3641. int i;
  3642. int discardtag = 0;
  3643. if ((ctxt == NULL) || (ctxt->input == NULL)) {
  3644. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  3645. "htmlParseStartTag: context error\n", NULL, NULL);
  3646. return -1;
  3647. }
  3648. if (ctxt->instate == XML_PARSER_EOF)
  3649. return(-1);
  3650. if (CUR != '<') return -1;
  3651. NEXT;
  3652. atts = ctxt->atts;
  3653. maxatts = ctxt->maxatts;
  3654. GROW;
  3655. name = htmlParseHTMLName(ctxt);
  3656. if (name == NULL) {
  3657. htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
  3658. "htmlParseStartTag: invalid element name\n",
  3659. NULL, NULL);
  3660. /*
  3661. * The recovery code is disabled for now as it can result in
  3662. * quadratic behavior with the push parser. htmlParseStartTag
  3663. * must consume all content up to the final '>' in order to avoid
  3664. * rescanning for this terminator.
  3665. *
  3666. * For a proper fix in line with HTML5, htmlParseStartTag and
  3667. * htmlParseElement should only be called when there's an ASCII
  3668. * alpha character following the initial '<'. Otherwise, the '<'
  3669. * should be emitted as text (unless followed by '!', '/' or '?').
  3670. */
  3671. #if 0
  3672. /* if recover preserve text on classic misconstructs */
  3673. if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
  3674. (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
  3675. htmlParseCharDataInternal(ctxt, '<');
  3676. return(-1);
  3677. }
  3678. #endif
  3679. /* Dump the bogus tag like browsers do */
  3680. while ((CUR != 0) && (CUR != '>') &&
  3681. (ctxt->instate != XML_PARSER_EOF))
  3682. NEXT;
  3683. return -1;
  3684. }
  3685. if (xmlStrEqual(name, BAD_CAST"meta"))
  3686. meta = 1;
  3687. /*
  3688. * Check for auto-closure of HTML elements.
  3689. */
  3690. htmlAutoClose(ctxt, name);
  3691. /*
  3692. * Check for implied HTML elements.
  3693. */
  3694. htmlCheckImplied(ctxt, name);
  3695. /*
  3696. * Avoid html at any level > 0, head at any level != 1
  3697. * or any attempt to recurse body
  3698. */
  3699. if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
  3700. htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
  3701. "htmlParseStartTag: misplaced <html> tag\n",
  3702. name, NULL);
  3703. discardtag = 1;
  3704. ctxt->depth++;
  3705. }
  3706. if ((ctxt->nameNr != 1) &&
  3707. (xmlStrEqual(name, BAD_CAST"head"))) {
  3708. htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
  3709. "htmlParseStartTag: misplaced <head> tag\n",
  3710. name, NULL);
  3711. discardtag = 1;
  3712. ctxt->depth++;
  3713. }
  3714. if (xmlStrEqual(name, BAD_CAST"body")) {
  3715. int indx;
  3716. for (indx = 0;indx < ctxt->nameNr;indx++) {
  3717. if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
  3718. htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
  3719. "htmlParseStartTag: misplaced <body> tag\n",
  3720. name, NULL);
  3721. discardtag = 1;
  3722. ctxt->depth++;
  3723. }
  3724. }
  3725. }
  3726. /*
  3727. * Now parse the attributes, it ends up with the ending
  3728. *
  3729. * (S Attribute)* S?
  3730. */
  3731. SKIP_BLANKS;
  3732. while ((CUR != 0) &&
  3733. (CUR != '>') &&
  3734. ((CUR != '/') || (NXT(1) != '>'))) {
  3735. GROW;
  3736. attname = htmlParseAttribute(ctxt, &attvalue);
  3737. if (attname != NULL) {
  3738. /*
  3739. * Well formedness requires at most one declaration of an attribute
  3740. */
  3741. for (i = 0; i < nbatts;i += 2) {
  3742. if (xmlStrEqual(atts[i], attname)) {
  3743. htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
  3744. "Attribute %s redefined\n", attname, NULL);
  3745. if (attvalue != NULL)
  3746. xmlFree(attvalue);
  3747. goto failed;
  3748. }
  3749. }
  3750. /*
  3751. * Add the pair to atts
  3752. */
  3753. if (atts == NULL) {
  3754. maxatts = 22; /* allow for 10 attrs by default */
  3755. atts = (const xmlChar **)
  3756. xmlMalloc(maxatts * sizeof(xmlChar *));
  3757. if (atts == NULL) {
  3758. htmlErrMemory(ctxt, NULL);
  3759. if (attvalue != NULL)
  3760. xmlFree(attvalue);
  3761. goto failed;
  3762. }
  3763. ctxt->atts = atts;
  3764. ctxt->maxatts = maxatts;
  3765. } else if (nbatts + 4 > maxatts) {
  3766. const xmlChar **n;
  3767. maxatts *= 2;
  3768. n = (const xmlChar **) xmlRealloc((void *) atts,
  3769. maxatts * sizeof(const xmlChar *));
  3770. if (n == NULL) {
  3771. htmlErrMemory(ctxt, NULL);
  3772. if (attvalue != NULL)
  3773. xmlFree(attvalue);
  3774. goto failed;
  3775. }
  3776. atts = n;
  3777. ctxt->atts = atts;
  3778. ctxt->maxatts = maxatts;
  3779. }
  3780. atts[nbatts++] = attname;
  3781. atts[nbatts++] = attvalue;
  3782. atts[nbatts] = NULL;
  3783. atts[nbatts + 1] = NULL;
  3784. }
  3785. else {
  3786. if (attvalue != NULL)
  3787. xmlFree(attvalue);
  3788. /* Dump the bogus attribute string up to the next blank or
  3789. * the end of the tag. */
  3790. while ((CUR != 0) &&
  3791. !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
  3792. ((CUR != '/') || (NXT(1) != '>')))
  3793. NEXT;
  3794. }
  3795. failed:
  3796. SKIP_BLANKS;
  3797. }
  3798. /*
  3799. * Handle specific association to the META tag
  3800. */
  3801. if (meta && (nbatts != 0))
  3802. htmlCheckMeta(ctxt, atts);
  3803. /*
  3804. * SAX: Start of Element !
  3805. */
  3806. if (!discardtag) {
  3807. htmlnamePush(ctxt, name);
  3808. if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
  3809. if (nbatts != 0)
  3810. ctxt->sax->startElement(ctxt->userData, name, atts);
  3811. else
  3812. ctxt->sax->startElement(ctxt->userData, name, NULL);
  3813. }
  3814. }
  3815. if (atts != NULL) {
  3816. for (i = 1;i < nbatts;i += 2) {
  3817. if (atts[i] != NULL)
  3818. xmlFree((xmlChar *) atts[i]);
  3819. }
  3820. }
  3821. return(discardtag);
  3822. }
  3823. /**
  3824. * htmlParseEndTag:
  3825. * @ctxt: an HTML parser context
  3826. *
  3827. * parse an end of tag
  3828. *
  3829. * [42] ETag ::= '</' Name S? '>'
  3830. *
  3831. * With namespace
  3832. *
  3833. * [NS 9] ETag ::= '</' QName S? '>'
  3834. *
  3835. * Returns 1 if the current level should be closed.
  3836. */
  3837. static int
  3838. htmlParseEndTag(htmlParserCtxtPtr ctxt)
  3839. {
  3840. const xmlChar *name;
  3841. const xmlChar *oldname;
  3842. int i, ret;
  3843. if ((CUR != '<') || (NXT(1) != '/')) {
  3844. htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
  3845. "htmlParseEndTag: '</' not found\n", NULL, NULL);
  3846. return (0);
  3847. }
  3848. SKIP(2);
  3849. name = htmlParseHTMLName(ctxt);
  3850. if (name == NULL)
  3851. return (0);
  3852. /*
  3853. * We should definitely be at the ending "S? '>'" part
  3854. */
  3855. SKIP_BLANKS;
  3856. if (CUR != '>') {
  3857. htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
  3858. "End tag : expected '>'\n", NULL, NULL);
  3859. /* Skip to next '>' */
  3860. while ((CUR != 0) && (CUR != '>'))
  3861. NEXT;
  3862. }
  3863. if (CUR == '>')
  3864. NEXT;
  3865. /*
  3866. * if we ignored misplaced tags in htmlParseStartTag don't pop them
  3867. * out now.
  3868. */
  3869. if ((ctxt->depth > 0) &&
  3870. (xmlStrEqual(name, BAD_CAST "html") ||
  3871. xmlStrEqual(name, BAD_CAST "body") ||
  3872. xmlStrEqual(name, BAD_CAST "head"))) {
  3873. ctxt->depth--;
  3874. return (0);
  3875. }
  3876. /*
  3877. * If the name read is not one of the element in the parsing stack
  3878. * then return, it's just an error.
  3879. */
  3880. for (i = (ctxt->nameNr - 1); i >= 0; i--) {
  3881. if (xmlStrEqual(name, ctxt->nameTab[i]))
  3882. break;
  3883. }
  3884. if (i < 0) {
  3885. htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
  3886. "Unexpected end tag : %s\n", name, NULL);
  3887. return (0);
  3888. }
  3889. /*
  3890. * Check for auto-closure of HTML elements.
  3891. */
  3892. htmlAutoCloseOnClose(ctxt, name);
  3893. /*
  3894. * Well formedness constraints, opening and closing must match.
  3895. * With the exception that the autoclose may have popped stuff out
  3896. * of the stack.
  3897. */
  3898. if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
  3899. htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
  3900. "Opening and ending tag mismatch: %s and %s\n",
  3901. name, ctxt->name);
  3902. }
  3903. /*
  3904. * SAX: End of Tag
  3905. */
  3906. oldname = ctxt->name;
  3907. if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
  3908. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  3909. ctxt->sax->endElement(ctxt->userData, name);
  3910. htmlNodeInfoPop(ctxt);
  3911. htmlnamePop(ctxt);
  3912. ret = 1;
  3913. } else {
  3914. ret = 0;
  3915. }
  3916. return (ret);
  3917. }
  3918. /**
  3919. * htmlParseReference:
  3920. * @ctxt: an HTML parser context
  3921. *
  3922. * parse and handle entity references in content,
  3923. * this will end-up in a call to character() since this is either a
  3924. * CharRef, or a predefined entity.
  3925. */
  3926. static void
  3927. htmlParseReference(htmlParserCtxtPtr ctxt) {
  3928. const htmlEntityDesc * ent;
  3929. xmlChar out[6];
  3930. const xmlChar *name;
  3931. if (CUR != '&') return;
  3932. if (NXT(1) == '#') {
  3933. unsigned int c;
  3934. int bits, i = 0;
  3935. c = htmlParseCharRef(ctxt);
  3936. if (c == 0)
  3937. return;
  3938. if (c < 0x80) { out[i++]= c; bits= -6; }
  3939. else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
  3940. else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
  3941. else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
  3942. for ( ; bits >= 0; bits-= 6) {
  3943. out[i++]= ((c >> bits) & 0x3F) | 0x80;
  3944. }
  3945. out[i] = 0;
  3946. htmlCheckParagraph(ctxt);
  3947. if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
  3948. ctxt->sax->characters(ctxt->userData, out, i);
  3949. } else {
  3950. ent = htmlParseEntityRef(ctxt, &name);
  3951. if (name == NULL) {
  3952. htmlCheckParagraph(ctxt);
  3953. if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
  3954. ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
  3955. return;
  3956. }
  3957. if ((ent == NULL) || !(ent->value > 0)) {
  3958. htmlCheckParagraph(ctxt);
  3959. if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
  3960. ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
  3961. ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
  3962. /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
  3963. }
  3964. } else {
  3965. unsigned int c;
  3966. int bits, i = 0;
  3967. c = ent->value;
  3968. if (c < 0x80)
  3969. { out[i++]= c; bits= -6; }
  3970. else if (c < 0x800)
  3971. { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
  3972. else if (c < 0x10000)
  3973. { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
  3974. else
  3975. { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
  3976. for ( ; bits >= 0; bits-= 6) {
  3977. out[i++]= ((c >> bits) & 0x3F) | 0x80;
  3978. }
  3979. out[i] = 0;
  3980. htmlCheckParagraph(ctxt);
  3981. if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
  3982. ctxt->sax->characters(ctxt->userData, out, i);
  3983. }
  3984. }
  3985. }
  3986. /**
  3987. * htmlParseContent:
  3988. * @ctxt: an HTML parser context
  3989. *
  3990. * Parse a content: comment, sub-element, reference or text.
  3991. * Kept for compatibility with old code
  3992. */
  3993. static void
  3994. htmlParseContent(htmlParserCtxtPtr ctxt) {
  3995. xmlChar *currentNode;
  3996. int depth;
  3997. const xmlChar *name;
  3998. currentNode = xmlStrdup(ctxt->name);
  3999. depth = ctxt->nameNr;
  4000. while (1) {
  4001. GROW;
  4002. if (ctxt->instate == XML_PARSER_EOF)
  4003. break;
  4004. /*
  4005. * Our tag or one of it's parent or children is ending.
  4006. */
  4007. if ((CUR == '<') && (NXT(1) == '/')) {
  4008. if (htmlParseEndTag(ctxt) &&
  4009. ((currentNode != NULL) || (ctxt->nameNr == 0))) {
  4010. if (currentNode != NULL)
  4011. xmlFree(currentNode);
  4012. return;
  4013. }
  4014. continue; /* while */
  4015. }
  4016. else if ((CUR == '<') &&
  4017. ((IS_ASCII_LETTER(NXT(1))) ||
  4018. (NXT(1) == '_') || (NXT(1) == ':'))) {
  4019. name = htmlParseHTMLName_nonInvasive(ctxt);
  4020. if (name == NULL) {
  4021. htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
  4022. "htmlParseStartTag: invalid element name\n",
  4023. NULL, NULL);
  4024. /* Dump the bogus tag like browsers do */
  4025. while ((CUR != 0) && (CUR != '>'))
  4026. NEXT;
  4027. if (currentNode != NULL)
  4028. xmlFree(currentNode);
  4029. return;
  4030. }
  4031. if (ctxt->name != NULL) {
  4032. if (htmlCheckAutoClose(name, ctxt->name) == 1) {
  4033. htmlAutoClose(ctxt, name);
  4034. continue;
  4035. }
  4036. }
  4037. }
  4038. /*
  4039. * Has this node been popped out during parsing of
  4040. * the next element
  4041. */
  4042. if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
  4043. (!xmlStrEqual(currentNode, ctxt->name)))
  4044. {
  4045. if (currentNode != NULL) xmlFree(currentNode);
  4046. return;
  4047. }
  4048. if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
  4049. (xmlStrEqual(currentNode, BAD_CAST"style")))) {
  4050. /*
  4051. * Handle SCRIPT/STYLE separately
  4052. */
  4053. htmlParseScript(ctxt);
  4054. } else {
  4055. /*
  4056. * Sometimes DOCTYPE arrives in the middle of the document
  4057. */
  4058. if ((CUR == '<') && (NXT(1) == '!') &&
  4059. (UPP(2) == 'D') && (UPP(3) == 'O') &&
  4060. (UPP(4) == 'C') && (UPP(5) == 'T') &&
  4061. (UPP(6) == 'Y') && (UPP(7) == 'P') &&
  4062. (UPP(8) == 'E')) {
  4063. htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
  4064. "Misplaced DOCTYPE declaration\n",
  4065. BAD_CAST "DOCTYPE" , NULL);
  4066. htmlParseDocTypeDecl(ctxt);
  4067. }
  4068. /*
  4069. * First case : a comment
  4070. */
  4071. if ((CUR == '<') && (NXT(1) == '!') &&
  4072. (NXT(2) == '-') && (NXT(3) == '-')) {
  4073. htmlParseComment(ctxt);
  4074. }
  4075. /*
  4076. * Second case : a Processing Instruction.
  4077. */
  4078. else if ((CUR == '<') && (NXT(1) == '?')) {
  4079. htmlParsePI(ctxt);
  4080. }
  4081. /*
  4082. * Third case : a sub-element.
  4083. */
  4084. else if (CUR == '<') {
  4085. htmlParseElement(ctxt);
  4086. }
  4087. /*
  4088. * Fourth case : a reference. If if has not been resolved,
  4089. * parsing returns it's Name, create the node
  4090. */
  4091. else if (CUR == '&') {
  4092. htmlParseReference(ctxt);
  4093. }
  4094. /*
  4095. * Fifth case : end of the resource
  4096. */
  4097. else if (CUR == 0) {
  4098. htmlAutoCloseOnEnd(ctxt);
  4099. break;
  4100. }
  4101. /*
  4102. * Last case, text. Note that References are handled directly.
  4103. */
  4104. else {
  4105. htmlParseCharData(ctxt);
  4106. }
  4107. }
  4108. GROW;
  4109. }
  4110. if (currentNode != NULL) xmlFree(currentNode);
  4111. }
  4112. /**
  4113. * htmlParseElement:
  4114. * @ctxt: an HTML parser context
  4115. *
  4116. * parse an HTML element, this is highly recursive
  4117. * this is kept for compatibility with previous code versions
  4118. *
  4119. * [39] element ::= EmptyElemTag | STag content ETag
  4120. *
  4121. * [41] Attribute ::= Name Eq AttValue
  4122. */
  4123. void
  4124. htmlParseElement(htmlParserCtxtPtr ctxt) {
  4125. const xmlChar *name;
  4126. xmlChar *currentNode = NULL;
  4127. const htmlElemDesc * info;
  4128. htmlParserNodeInfo node_info;
  4129. int failed;
  4130. int depth;
  4131. const xmlChar *oldptr;
  4132. if ((ctxt == NULL) || (ctxt->input == NULL)) {
  4133. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  4134. "htmlParseElement: context error\n", NULL, NULL);
  4135. return;
  4136. }
  4137. if (ctxt->instate == XML_PARSER_EOF)
  4138. return;
  4139. /* Capture start position */
  4140. if (ctxt->record_info) {
  4141. node_info.begin_pos = ctxt->input->consumed +
  4142. (CUR_PTR - ctxt->input->base);
  4143. node_info.begin_line = ctxt->input->line;
  4144. }
  4145. failed = htmlParseStartTag(ctxt);
  4146. name = ctxt->name;
  4147. if ((failed == -1) || (name == NULL)) {
  4148. if (CUR == '>')
  4149. NEXT;
  4150. return;
  4151. }
  4152. /*
  4153. * Lookup the info for that element.
  4154. */
  4155. info = htmlTagLookup(name);
  4156. if (info == NULL) {
  4157. htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
  4158. "Tag %s invalid\n", name, NULL);
  4159. }
  4160. /*
  4161. * Check for an Empty Element labeled the XML/SGML way
  4162. */
  4163. if ((CUR == '/') && (NXT(1) == '>')) {
  4164. SKIP(2);
  4165. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  4166. ctxt->sax->endElement(ctxt->userData, name);
  4167. htmlnamePop(ctxt);
  4168. return;
  4169. }
  4170. if (CUR == '>') {
  4171. NEXT;
  4172. } else {
  4173. htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
  4174. "Couldn't find end of Start Tag %s\n", name, NULL);
  4175. /*
  4176. * end of parsing of this node.
  4177. */
  4178. if (xmlStrEqual(name, ctxt->name)) {
  4179. nodePop(ctxt);
  4180. htmlnamePop(ctxt);
  4181. }
  4182. /*
  4183. * Capture end position and add node
  4184. */
  4185. if (ctxt->record_info) {
  4186. node_info.end_pos = ctxt->input->consumed +
  4187. (CUR_PTR - ctxt->input->base);
  4188. node_info.end_line = ctxt->input->line;
  4189. node_info.node = ctxt->node;
  4190. xmlParserAddNodeInfo(ctxt, &node_info);
  4191. }
  4192. return;
  4193. }
  4194. /*
  4195. * Check for an Empty Element from DTD definition
  4196. */
  4197. if ((info != NULL) && (info->empty)) {
  4198. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  4199. ctxt->sax->endElement(ctxt->userData, name);
  4200. htmlnamePop(ctxt);
  4201. return;
  4202. }
  4203. /*
  4204. * Parse the content of the element:
  4205. */
  4206. currentNode = xmlStrdup(ctxt->name);
  4207. depth = ctxt->nameNr;
  4208. while (CUR != 0) {
  4209. oldptr = ctxt->input->cur;
  4210. htmlParseContent(ctxt);
  4211. if (oldptr==ctxt->input->cur) break;
  4212. if (ctxt->nameNr < depth) break;
  4213. }
  4214. /*
  4215. * Capture end position and add node
  4216. */
  4217. if ( currentNode != NULL && ctxt->record_info ) {
  4218. node_info.end_pos = ctxt->input->consumed +
  4219. (CUR_PTR - ctxt->input->base);
  4220. node_info.end_line = ctxt->input->line;
  4221. node_info.node = ctxt->node;
  4222. xmlParserAddNodeInfo(ctxt, &node_info);
  4223. }
  4224. if (CUR == 0) {
  4225. htmlAutoCloseOnEnd(ctxt);
  4226. }
  4227. if (currentNode != NULL)
  4228. xmlFree(currentNode);
  4229. }
  4230. static void
  4231. htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
  4232. /*
  4233. * Capture end position and add node
  4234. */
  4235. if ( ctxt->node != NULL && ctxt->record_info ) {
  4236. ctxt->nodeInfo->end_pos = ctxt->input->consumed +
  4237. (CUR_PTR - ctxt->input->base);
  4238. ctxt->nodeInfo->end_line = ctxt->input->line;
  4239. ctxt->nodeInfo->node = ctxt->node;
  4240. xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
  4241. htmlNodeInfoPop(ctxt);
  4242. }
  4243. if (CUR == 0) {
  4244. htmlAutoCloseOnEnd(ctxt);
  4245. }
  4246. }
  4247. /**
  4248. * htmlParseElementInternal:
  4249. * @ctxt: an HTML parser context
  4250. *
  4251. * parse an HTML element, new version, non recursive
  4252. *
  4253. * [39] element ::= EmptyElemTag | STag content ETag
  4254. *
  4255. * [41] Attribute ::= Name Eq AttValue
  4256. */
  4257. static void
  4258. htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
  4259. const xmlChar *name;
  4260. const htmlElemDesc * info;
  4261. htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
  4262. int failed;
  4263. if ((ctxt == NULL) || (ctxt->input == NULL)) {
  4264. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  4265. "htmlParseElementInternal: context error\n", NULL, NULL);
  4266. return;
  4267. }
  4268. if (ctxt->instate == XML_PARSER_EOF)
  4269. return;
  4270. /* Capture start position */
  4271. if (ctxt->record_info) {
  4272. node_info.begin_pos = ctxt->input->consumed +
  4273. (CUR_PTR - ctxt->input->base);
  4274. node_info.begin_line = ctxt->input->line;
  4275. }
  4276. failed = htmlParseStartTag(ctxt);
  4277. name = ctxt->name;
  4278. if ((failed == -1) || (name == NULL)) {
  4279. if (CUR == '>')
  4280. NEXT;
  4281. return;
  4282. }
  4283. /*
  4284. * Lookup the info for that element.
  4285. */
  4286. info = htmlTagLookup(name);
  4287. if (info == NULL) {
  4288. htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
  4289. "Tag %s invalid\n", name, NULL);
  4290. }
  4291. /*
  4292. * Check for an Empty Element labeled the XML/SGML way
  4293. */
  4294. if ((CUR == '/') && (NXT(1) == '>')) {
  4295. SKIP(2);
  4296. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  4297. ctxt->sax->endElement(ctxt->userData, name);
  4298. htmlnamePop(ctxt);
  4299. return;
  4300. }
  4301. if (CUR == '>') {
  4302. NEXT;
  4303. } else {
  4304. htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
  4305. "Couldn't find end of Start Tag %s\n", name, NULL);
  4306. /*
  4307. * end of parsing of this node.
  4308. */
  4309. if (xmlStrEqual(name, ctxt->name)) {
  4310. nodePop(ctxt);
  4311. htmlnamePop(ctxt);
  4312. }
  4313. if (ctxt->record_info)
  4314. htmlNodeInfoPush(ctxt, &node_info);
  4315. htmlParserFinishElementParsing(ctxt);
  4316. return;
  4317. }
  4318. /*
  4319. * Check for an Empty Element from DTD definition
  4320. */
  4321. if ((info != NULL) && (info->empty)) {
  4322. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  4323. ctxt->sax->endElement(ctxt->userData, name);
  4324. htmlnamePop(ctxt);
  4325. return;
  4326. }
  4327. if (ctxt->record_info)
  4328. htmlNodeInfoPush(ctxt, &node_info);
  4329. }
  4330. /**
  4331. * htmlParseContentInternal:
  4332. * @ctxt: an HTML parser context
  4333. *
  4334. * Parse a content: comment, sub-element, reference or text.
  4335. * New version for non recursive htmlParseElementInternal
  4336. */
  4337. static void
  4338. htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
  4339. xmlChar *currentNode;
  4340. int depth;
  4341. const xmlChar *name;
  4342. currentNode = xmlStrdup(ctxt->name);
  4343. depth = ctxt->nameNr;
  4344. while (1) {
  4345. GROW;
  4346. if (ctxt->instate == XML_PARSER_EOF)
  4347. break;
  4348. /*
  4349. * Our tag or one of it's parent or children is ending.
  4350. */
  4351. if ((CUR == '<') && (NXT(1) == '/')) {
  4352. if (htmlParseEndTag(ctxt) &&
  4353. ((currentNode != NULL) || (ctxt->nameNr == 0))) {
  4354. if (currentNode != NULL)
  4355. xmlFree(currentNode);
  4356. currentNode = xmlStrdup(ctxt->name);
  4357. depth = ctxt->nameNr;
  4358. }
  4359. continue; /* while */
  4360. }
  4361. else if ((CUR == '<') &&
  4362. ((IS_ASCII_LETTER(NXT(1))) ||
  4363. (NXT(1) == '_') || (NXT(1) == ':'))) {
  4364. name = htmlParseHTMLName_nonInvasive(ctxt);
  4365. if (name == NULL) {
  4366. htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
  4367. "htmlParseStartTag: invalid element name\n",
  4368. NULL, NULL);
  4369. /* Dump the bogus tag like browsers do */
  4370. while ((CUR == 0) && (CUR != '>'))
  4371. NEXT;
  4372. htmlParserFinishElementParsing(ctxt);
  4373. if (currentNode != NULL)
  4374. xmlFree(currentNode);
  4375. currentNode = xmlStrdup(ctxt->name);
  4376. depth = ctxt->nameNr;
  4377. continue;
  4378. }
  4379. if (ctxt->name != NULL) {
  4380. if (htmlCheckAutoClose(name, ctxt->name) == 1) {
  4381. htmlAutoClose(ctxt, name);
  4382. continue;
  4383. }
  4384. }
  4385. }
  4386. /*
  4387. * Has this node been popped out during parsing of
  4388. * the next element
  4389. */
  4390. if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
  4391. (!xmlStrEqual(currentNode, ctxt->name)))
  4392. {
  4393. htmlParserFinishElementParsing(ctxt);
  4394. if (currentNode != NULL) xmlFree(currentNode);
  4395. currentNode = xmlStrdup(ctxt->name);
  4396. depth = ctxt->nameNr;
  4397. continue;
  4398. }
  4399. if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
  4400. (xmlStrEqual(currentNode, BAD_CAST"style")))) {
  4401. /*
  4402. * Handle SCRIPT/STYLE separately
  4403. */
  4404. htmlParseScript(ctxt);
  4405. } else {
  4406. /*
  4407. * Sometimes DOCTYPE arrives in the middle of the document
  4408. */
  4409. if ((CUR == '<') && (NXT(1) == '!') &&
  4410. (UPP(2) == 'D') && (UPP(3) == 'O') &&
  4411. (UPP(4) == 'C') && (UPP(5) == 'T') &&
  4412. (UPP(6) == 'Y') && (UPP(7) == 'P') &&
  4413. (UPP(8) == 'E')) {
  4414. htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
  4415. "Misplaced DOCTYPE declaration\n",
  4416. BAD_CAST "DOCTYPE" , NULL);
  4417. htmlParseDocTypeDecl(ctxt);
  4418. }
  4419. /*
  4420. * First case : a comment
  4421. */
  4422. if ((CUR == '<') && (NXT(1) == '!') &&
  4423. (NXT(2) == '-') && (NXT(3) == '-')) {
  4424. htmlParseComment(ctxt);
  4425. }
  4426. /*
  4427. * Second case : a Processing Instruction.
  4428. */
  4429. else if ((CUR == '<') && (NXT(1) == '?')) {
  4430. htmlParsePI(ctxt);
  4431. }
  4432. /*
  4433. * Third case : a sub-element.
  4434. */
  4435. else if (CUR == '<') {
  4436. htmlParseElementInternal(ctxt);
  4437. if (currentNode != NULL) xmlFree(currentNode);
  4438. currentNode = xmlStrdup(ctxt->name);
  4439. depth = ctxt->nameNr;
  4440. }
  4441. /*
  4442. * Fourth case : a reference. If if has not been resolved,
  4443. * parsing returns it's Name, create the node
  4444. */
  4445. else if (CUR == '&') {
  4446. htmlParseReference(ctxt);
  4447. }
  4448. /*
  4449. * Fifth case : end of the resource
  4450. */
  4451. else if (CUR == 0) {
  4452. htmlAutoCloseOnEnd(ctxt);
  4453. break;
  4454. }
  4455. /*
  4456. * Last case, text. Note that References are handled directly.
  4457. */
  4458. else {
  4459. htmlParseCharData(ctxt);
  4460. }
  4461. }
  4462. GROW;
  4463. }
  4464. if (currentNode != NULL) xmlFree(currentNode);
  4465. }
  4466. /**
  4467. * htmlParseContent:
  4468. * @ctxt: an HTML parser context
  4469. *
  4470. * Parse a content: comment, sub-element, reference or text.
  4471. * This is the entry point when called from parser.c
  4472. */
  4473. void
  4474. __htmlParseContent(void *ctxt) {
  4475. if (ctxt != NULL)
  4476. htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
  4477. }
  4478. /**
  4479. * htmlParseDocument:
  4480. * @ctxt: an HTML parser context
  4481. *
  4482. * parse an HTML document (and build a tree if using the standard SAX
  4483. * interface).
  4484. *
  4485. * Returns 0, -1 in case of error. the parser context is augmented
  4486. * as a result of the parsing.
  4487. */
  4488. int
  4489. htmlParseDocument(htmlParserCtxtPtr ctxt) {
  4490. xmlChar start[4];
  4491. xmlCharEncoding enc;
  4492. xmlDtdPtr dtd;
  4493. xmlInitParser();
  4494. htmlDefaultSAXHandlerInit();
  4495. if ((ctxt == NULL) || (ctxt->input == NULL)) {
  4496. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  4497. "htmlParseDocument: context error\n", NULL, NULL);
  4498. return(XML_ERR_INTERNAL_ERROR);
  4499. }
  4500. ctxt->html = 1;
  4501. ctxt->linenumbers = 1;
  4502. GROW;
  4503. /*
  4504. * SAX: beginning of the document processing.
  4505. */
  4506. if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
  4507. ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
  4508. if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
  4509. ((ctxt->input->end - ctxt->input->cur) >= 4)) {
  4510. /*
  4511. * Get the 4 first bytes and decode the charset
  4512. * if enc != XML_CHAR_ENCODING_NONE
  4513. * plug some encoding conversion routines.
  4514. */
  4515. start[0] = RAW;
  4516. start[1] = NXT(1);
  4517. start[2] = NXT(2);
  4518. start[3] = NXT(3);
  4519. enc = xmlDetectCharEncoding(&start[0], 4);
  4520. if (enc != XML_CHAR_ENCODING_NONE) {
  4521. xmlSwitchEncoding(ctxt, enc);
  4522. }
  4523. }
  4524. /*
  4525. * Wipe out everything which is before the first '<'
  4526. */
  4527. SKIP_BLANKS;
  4528. if (CUR == 0) {
  4529. htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
  4530. "Document is empty\n", NULL, NULL);
  4531. }
  4532. if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
  4533. ctxt->sax->startDocument(ctxt->userData);
  4534. /*
  4535. * Parse possible comments and PIs before any content
  4536. */
  4537. while (((CUR == '<') && (NXT(1) == '!') &&
  4538. (NXT(2) == '-') && (NXT(3) == '-')) ||
  4539. ((CUR == '<') && (NXT(1) == '?'))) {
  4540. htmlParseComment(ctxt);
  4541. htmlParsePI(ctxt);
  4542. SKIP_BLANKS;
  4543. }
  4544. /*
  4545. * Then possibly doc type declaration(s) and more Misc
  4546. * (doctypedecl Misc*)?
  4547. */
  4548. if ((CUR == '<') && (NXT(1) == '!') &&
  4549. (UPP(2) == 'D') && (UPP(3) == 'O') &&
  4550. (UPP(4) == 'C') && (UPP(5) == 'T') &&
  4551. (UPP(6) == 'Y') && (UPP(7) == 'P') &&
  4552. (UPP(8) == 'E')) {
  4553. htmlParseDocTypeDecl(ctxt);
  4554. }
  4555. SKIP_BLANKS;
  4556. /*
  4557. * Parse possible comments and PIs before any content
  4558. */
  4559. while (((CUR == '<') && (NXT(1) == '!') &&
  4560. (NXT(2) == '-') && (NXT(3) == '-')) ||
  4561. ((CUR == '<') && (NXT(1) == '?'))) {
  4562. htmlParseComment(ctxt);
  4563. htmlParsePI(ctxt);
  4564. SKIP_BLANKS;
  4565. }
  4566. /*
  4567. * Time to start parsing the tree itself
  4568. */
  4569. htmlParseContentInternal(ctxt);
  4570. /*
  4571. * autoclose
  4572. */
  4573. if (CUR == 0)
  4574. htmlAutoCloseOnEnd(ctxt);
  4575. /*
  4576. * SAX: end of the document processing.
  4577. */
  4578. if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
  4579. ctxt->sax->endDocument(ctxt->userData);
  4580. if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
  4581. dtd = xmlGetIntSubset(ctxt->myDoc);
  4582. if (dtd == NULL)
  4583. ctxt->myDoc->intSubset =
  4584. xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
  4585. BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
  4586. BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
  4587. }
  4588. if (! ctxt->wellFormed) return(-1);
  4589. return(0);
  4590. }
  4591. /************************************************************************
  4592. * *
  4593. * Parser contexts handling *
  4594. * *
  4595. ************************************************************************/
  4596. /**
  4597. * htmlInitParserCtxt:
  4598. * @ctxt: an HTML parser context
  4599. *
  4600. * Initialize a parser context
  4601. *
  4602. * Returns 0 in case of success and -1 in case of error
  4603. */
  4604. static int
  4605. htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
  4606. {
  4607. htmlSAXHandler *sax;
  4608. if (ctxt == NULL) return(-1);
  4609. memset(ctxt, 0, sizeof(htmlParserCtxt));
  4610. ctxt->dict = xmlDictCreate();
  4611. if (ctxt->dict == NULL) {
  4612. htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
  4613. return(-1);
  4614. }
  4615. sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
  4616. if (sax == NULL) {
  4617. htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
  4618. return(-1);
  4619. }
  4620. else
  4621. memset(sax, 0, sizeof(htmlSAXHandler));
  4622. /* Allocate the Input stack */
  4623. ctxt->inputTab = (htmlParserInputPtr *)
  4624. xmlMalloc(5 * sizeof(htmlParserInputPtr));
  4625. if (ctxt->inputTab == NULL) {
  4626. htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
  4627. ctxt->inputNr = 0;
  4628. ctxt->inputMax = 0;
  4629. ctxt->input = NULL;
  4630. return(-1);
  4631. }
  4632. ctxt->inputNr = 0;
  4633. ctxt->inputMax = 5;
  4634. ctxt->input = NULL;
  4635. ctxt->version = NULL;
  4636. ctxt->encoding = NULL;
  4637. ctxt->standalone = -1;
  4638. ctxt->instate = XML_PARSER_START;
  4639. /* Allocate the Node stack */
  4640. ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
  4641. if (ctxt->nodeTab == NULL) {
  4642. htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
  4643. ctxt->nodeNr = 0;
  4644. ctxt->nodeMax = 0;
  4645. ctxt->node = NULL;
  4646. ctxt->inputNr = 0;
  4647. ctxt->inputMax = 0;
  4648. ctxt->input = NULL;
  4649. return(-1);
  4650. }
  4651. ctxt->nodeNr = 0;
  4652. ctxt->nodeMax = 10;
  4653. ctxt->node = NULL;
  4654. /* Allocate the Name stack */
  4655. ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
  4656. if (ctxt->nameTab == NULL) {
  4657. htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
  4658. ctxt->nameNr = 0;
  4659. ctxt->nameMax = 0;
  4660. ctxt->name = NULL;
  4661. ctxt->nodeNr = 0;
  4662. ctxt->nodeMax = 0;
  4663. ctxt->node = NULL;
  4664. ctxt->inputNr = 0;
  4665. ctxt->inputMax = 0;
  4666. ctxt->input = NULL;
  4667. return(-1);
  4668. }
  4669. ctxt->nameNr = 0;
  4670. ctxt->nameMax = 10;
  4671. ctxt->name = NULL;
  4672. ctxt->nodeInfoTab = NULL;
  4673. ctxt->nodeInfoNr = 0;
  4674. ctxt->nodeInfoMax = 0;
  4675. if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
  4676. else {
  4677. ctxt->sax = sax;
  4678. memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
  4679. }
  4680. ctxt->userData = ctxt;
  4681. ctxt->myDoc = NULL;
  4682. ctxt->wellFormed = 1;
  4683. ctxt->replaceEntities = 0;
  4684. ctxt->linenumbers = xmlLineNumbersDefaultValue;
  4685. ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
  4686. ctxt->html = 1;
  4687. ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
  4688. ctxt->vctxt.userData = ctxt;
  4689. ctxt->vctxt.error = xmlParserValidityError;
  4690. ctxt->vctxt.warning = xmlParserValidityWarning;
  4691. ctxt->record_info = 0;
  4692. ctxt->validate = 0;
  4693. ctxt->checkIndex = 0;
  4694. ctxt->catalogs = NULL;
  4695. xmlInitNodeInfoSeq(&ctxt->node_seq);
  4696. return(0);
  4697. }
  4698. /**
  4699. * htmlFreeParserCtxt:
  4700. * @ctxt: an HTML parser context
  4701. *
  4702. * Free all the memory used by a parser context. However the parsed
  4703. * document in ctxt->myDoc is not freed.
  4704. */
  4705. void
  4706. htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
  4707. {
  4708. xmlFreeParserCtxt(ctxt);
  4709. }
  4710. /**
  4711. * htmlNewParserCtxt:
  4712. *
  4713. * Allocate and initialize a new parser context.
  4714. *
  4715. * Returns the htmlParserCtxtPtr or NULL in case of allocation error
  4716. */
  4717. htmlParserCtxtPtr
  4718. htmlNewParserCtxt(void)
  4719. {
  4720. xmlParserCtxtPtr ctxt;
  4721. ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
  4722. if (ctxt == NULL) {
  4723. htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
  4724. return(NULL);
  4725. }
  4726. memset(ctxt, 0, sizeof(xmlParserCtxt));
  4727. if (htmlInitParserCtxt(ctxt) < 0) {
  4728. htmlFreeParserCtxt(ctxt);
  4729. return(NULL);
  4730. }
  4731. return(ctxt);
  4732. }
  4733. /**
  4734. * htmlCreateMemoryParserCtxt:
  4735. * @buffer: a pointer to a char array
  4736. * @size: the size of the array
  4737. *
  4738. * Create a parser context for an HTML in-memory document.
  4739. *
  4740. * Returns the new parser context or NULL
  4741. */
  4742. htmlParserCtxtPtr
  4743. htmlCreateMemoryParserCtxt(const char *buffer, int size) {
  4744. xmlParserCtxtPtr ctxt;
  4745. xmlParserInputPtr input;
  4746. xmlParserInputBufferPtr buf;
  4747. if (buffer == NULL)
  4748. return(NULL);
  4749. if (size <= 0)
  4750. return(NULL);
  4751. ctxt = htmlNewParserCtxt();
  4752. if (ctxt == NULL)
  4753. return(NULL);
  4754. buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
  4755. if (buf == NULL) return(NULL);
  4756. input = xmlNewInputStream(ctxt);
  4757. if (input == NULL) {
  4758. xmlFreeParserInputBuffer(buf);
  4759. xmlFreeParserCtxt(ctxt);
  4760. return(NULL);
  4761. }
  4762. input->filename = NULL;
  4763. input->buf = buf;
  4764. xmlBufResetInput(buf->buffer, input);
  4765. inputPush(ctxt, input);
  4766. return(ctxt);
  4767. }
  4768. /**
  4769. * htmlCreateDocParserCtxt:
  4770. * @cur: a pointer to an array of xmlChar
  4771. * @encoding: a free form C string describing the HTML document encoding, or NULL
  4772. *
  4773. * Create a parser context for an HTML document.
  4774. *
  4775. * TODO: check the need to add encoding handling there
  4776. *
  4777. * Returns the new parser context or NULL
  4778. */
  4779. static htmlParserCtxtPtr
  4780. htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
  4781. int len;
  4782. htmlParserCtxtPtr ctxt;
  4783. if (cur == NULL)
  4784. return(NULL);
  4785. len = xmlStrlen(cur);
  4786. ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
  4787. if (ctxt == NULL)
  4788. return(NULL);
  4789. if (encoding != NULL) {
  4790. xmlCharEncoding enc;
  4791. xmlCharEncodingHandlerPtr handler;
  4792. if (ctxt->input->encoding != NULL)
  4793. xmlFree((xmlChar *) ctxt->input->encoding);
  4794. ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
  4795. enc = xmlParseCharEncoding(encoding);
  4796. /*
  4797. * registered set of known encodings
  4798. */
  4799. if (enc != XML_CHAR_ENCODING_ERROR) {
  4800. xmlSwitchEncoding(ctxt, enc);
  4801. if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
  4802. htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  4803. "Unsupported encoding %s\n",
  4804. (const xmlChar *) encoding, NULL);
  4805. }
  4806. } else {
  4807. /*
  4808. * fallback for unknown encodings
  4809. */
  4810. handler = xmlFindCharEncodingHandler((const char *) encoding);
  4811. if (handler != NULL) {
  4812. xmlSwitchToEncoding(ctxt, handler);
  4813. } else {
  4814. htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  4815. "Unsupported encoding %s\n",
  4816. (const xmlChar *) encoding, NULL);
  4817. }
  4818. }
  4819. }
  4820. return(ctxt);
  4821. }
  4822. #ifdef LIBXML_PUSH_ENABLED
  4823. /************************************************************************
  4824. * *
  4825. * Progressive parsing interfaces *
  4826. * *
  4827. ************************************************************************/
  4828. /**
  4829. * htmlParseLookupSequence:
  4830. * @ctxt: an HTML parser context
  4831. * @first: the first char to lookup
  4832. * @next: the next char to lookup or zero
  4833. * @third: the next char to lookup or zero
  4834. * @ignoreattrval: skip over attribute values
  4835. *
  4836. * Try to find if a sequence (first, next, third) or just (first next) or
  4837. * (first) is available in the input stream.
  4838. * This function has a side effect of (possibly) incrementing ctxt->checkIndex
  4839. * to avoid rescanning sequences of bytes, it DOES change the state of the
  4840. * parser, do not use liberally.
  4841. * This is basically similar to xmlParseLookupSequence()
  4842. *
  4843. * Returns the index to the current parsing point if the full sequence
  4844. * is available, -1 otherwise.
  4845. */
  4846. static int
  4847. htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
  4848. xmlChar next, xmlChar third, int ignoreattrval)
  4849. {
  4850. int base, len;
  4851. htmlParserInputPtr in;
  4852. const xmlChar *buf;
  4853. int invalue = 0;
  4854. char valdellim = 0x0;
  4855. in = ctxt->input;
  4856. if (in == NULL)
  4857. return (-1);
  4858. base = in->cur - in->base;
  4859. if (base < 0)
  4860. return (-1);
  4861. if (ctxt->checkIndex > base) {
  4862. base = ctxt->checkIndex;
  4863. /* Abuse hasPErefs member to restore current state. */
  4864. invalue = ctxt->hasPErefs & 1 ? 1 : 0;
  4865. }
  4866. if (in->buf == NULL) {
  4867. buf = in->base;
  4868. len = in->length;
  4869. } else {
  4870. buf = xmlBufContent(in->buf->buffer);
  4871. len = xmlBufUse(in->buf->buffer);
  4872. }
  4873. /* take into account the sequence length */
  4874. if (third)
  4875. len -= 2;
  4876. else if (next)
  4877. len--;
  4878. for (; base < len; base++) {
  4879. if (ignoreattrval) {
  4880. if (buf[base] == '"' || buf[base] == '\'') {
  4881. if (invalue) {
  4882. if (buf[base] == valdellim) {
  4883. invalue = 0;
  4884. continue;
  4885. }
  4886. } else {
  4887. valdellim = buf[base];
  4888. invalue = 1;
  4889. continue;
  4890. }
  4891. } else if (invalue) {
  4892. continue;
  4893. }
  4894. }
  4895. if (buf[base] == first) {
  4896. if (third != 0) {
  4897. if ((buf[base + 1] != next) || (buf[base + 2] != third))
  4898. continue;
  4899. } else if (next != 0) {
  4900. if (buf[base + 1] != next)
  4901. continue;
  4902. }
  4903. ctxt->checkIndex = 0;
  4904. #ifdef DEBUG_PUSH
  4905. if (next == 0)
  4906. xmlGenericError(xmlGenericErrorContext,
  4907. "HPP: lookup '%c' found at %d\n",
  4908. first, base);
  4909. else if (third == 0)
  4910. xmlGenericError(xmlGenericErrorContext,
  4911. "HPP: lookup '%c%c' found at %d\n",
  4912. first, next, base);
  4913. else
  4914. xmlGenericError(xmlGenericErrorContext,
  4915. "HPP: lookup '%c%c%c' found at %d\n",
  4916. first, next, third, base);
  4917. #endif
  4918. return (base - (in->cur - in->base));
  4919. }
  4920. }
  4921. ctxt->checkIndex = base;
  4922. /* Abuse hasPErefs member to track current state. */
  4923. if (invalue)
  4924. ctxt->hasPErefs |= 1;
  4925. else
  4926. ctxt->hasPErefs &= ~1;
  4927. #ifdef DEBUG_PUSH
  4928. if (next == 0)
  4929. xmlGenericError(xmlGenericErrorContext,
  4930. "HPP: lookup '%c' failed\n", first);
  4931. else if (third == 0)
  4932. xmlGenericError(xmlGenericErrorContext,
  4933. "HPP: lookup '%c%c' failed\n", first, next);
  4934. else
  4935. xmlGenericError(xmlGenericErrorContext,
  4936. "HPP: lookup '%c%c%c' failed\n", first, next,
  4937. third);
  4938. #endif
  4939. return (-1);
  4940. }
  4941. /**
  4942. * htmlParseLookupCommentEnd:
  4943. * @ctxt: an HTML parser context
  4944. *
  4945. * Try to find a comment end tag in the input stream
  4946. * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
  4947. * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
  4948. * This function has a side effect of (possibly) incrementing ctxt->checkIndex
  4949. * to avoid rescanning sequences of bytes, it DOES change the state of the
  4950. * parser, do not use liberally.
  4951. * This wraps to htmlParseLookupSequence()
  4952. *
  4953. * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
  4954. */
  4955. static int
  4956. htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
  4957. {
  4958. int mark = 0;
  4959. int cur = CUR_PTR - BASE_PTR;
  4960. while (mark >= 0) {
  4961. mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
  4962. if ((mark < 0) ||
  4963. (NXT(mark+2) == '>') ||
  4964. ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
  4965. return mark;
  4966. }
  4967. ctxt->checkIndex = cur + mark + 1;
  4968. }
  4969. return mark;
  4970. }
  4971. /**
  4972. * htmlParseTryOrFinish:
  4973. * @ctxt: an HTML parser context
  4974. * @terminate: last chunk indicator
  4975. *
  4976. * Try to progress on parsing
  4977. *
  4978. * Returns zero if no parsing was possible
  4979. */
  4980. static int
  4981. htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
  4982. int ret = 0;
  4983. htmlParserInputPtr in;
  4984. ptrdiff_t avail = 0;
  4985. xmlChar cur, next;
  4986. htmlParserNodeInfo node_info;
  4987. #ifdef DEBUG_PUSH
  4988. switch (ctxt->instate) {
  4989. case XML_PARSER_EOF:
  4990. xmlGenericError(xmlGenericErrorContext,
  4991. "HPP: try EOF\n"); break;
  4992. case XML_PARSER_START:
  4993. xmlGenericError(xmlGenericErrorContext,
  4994. "HPP: try START\n"); break;
  4995. case XML_PARSER_MISC:
  4996. xmlGenericError(xmlGenericErrorContext,
  4997. "HPP: try MISC\n");break;
  4998. case XML_PARSER_COMMENT:
  4999. xmlGenericError(xmlGenericErrorContext,
  5000. "HPP: try COMMENT\n");break;
  5001. case XML_PARSER_PROLOG:
  5002. xmlGenericError(xmlGenericErrorContext,
  5003. "HPP: try PROLOG\n");break;
  5004. case XML_PARSER_START_TAG:
  5005. xmlGenericError(xmlGenericErrorContext,
  5006. "HPP: try START_TAG\n");break;
  5007. case XML_PARSER_CONTENT:
  5008. xmlGenericError(xmlGenericErrorContext,
  5009. "HPP: try CONTENT\n");break;
  5010. case XML_PARSER_CDATA_SECTION:
  5011. xmlGenericError(xmlGenericErrorContext,
  5012. "HPP: try CDATA_SECTION\n");break;
  5013. case XML_PARSER_END_TAG:
  5014. xmlGenericError(xmlGenericErrorContext,
  5015. "HPP: try END_TAG\n");break;
  5016. case XML_PARSER_ENTITY_DECL:
  5017. xmlGenericError(xmlGenericErrorContext,
  5018. "HPP: try ENTITY_DECL\n");break;
  5019. case XML_PARSER_ENTITY_VALUE:
  5020. xmlGenericError(xmlGenericErrorContext,
  5021. "HPP: try ENTITY_VALUE\n");break;
  5022. case XML_PARSER_ATTRIBUTE_VALUE:
  5023. xmlGenericError(xmlGenericErrorContext,
  5024. "HPP: try ATTRIBUTE_VALUE\n");break;
  5025. case XML_PARSER_DTD:
  5026. xmlGenericError(xmlGenericErrorContext,
  5027. "HPP: try DTD\n");break;
  5028. case XML_PARSER_EPILOG:
  5029. xmlGenericError(xmlGenericErrorContext,
  5030. "HPP: try EPILOG\n");break;
  5031. case XML_PARSER_PI:
  5032. xmlGenericError(xmlGenericErrorContext,
  5033. "HPP: try PI\n");break;
  5034. case XML_PARSER_SYSTEM_LITERAL:
  5035. xmlGenericError(xmlGenericErrorContext,
  5036. "HPP: try SYSTEM_LITERAL\n");break;
  5037. }
  5038. #endif
  5039. while (1) {
  5040. in = ctxt->input;
  5041. if (in == NULL) break;
  5042. if (in->buf == NULL)
  5043. avail = in->length - (in->cur - in->base);
  5044. else
  5045. avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
  5046. (in->cur - in->base);
  5047. if ((avail == 0) && (terminate)) {
  5048. htmlAutoCloseOnEnd(ctxt);
  5049. if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
  5050. /*
  5051. * SAX: end of the document processing.
  5052. */
  5053. ctxt->instate = XML_PARSER_EOF;
  5054. if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
  5055. ctxt->sax->endDocument(ctxt->userData);
  5056. }
  5057. }
  5058. if (avail < 1)
  5059. goto done;
  5060. /*
  5061. * This is done to make progress and avoid an infinite loop
  5062. * if a parsing attempt was aborted by hitting a NUL byte. After
  5063. * changing htmlCurrentChar, this probably isn't necessary anymore.
  5064. * We should consider removing this check.
  5065. */
  5066. cur = in->cur[0];
  5067. if (cur == 0) {
  5068. SKIP(1);
  5069. continue;
  5070. }
  5071. switch (ctxt->instate) {
  5072. case XML_PARSER_EOF:
  5073. /*
  5074. * Document parsing is done !
  5075. */
  5076. goto done;
  5077. case XML_PARSER_START:
  5078. /*
  5079. * Very first chars read from the document flow.
  5080. */
  5081. cur = in->cur[0];
  5082. if (IS_BLANK_CH(cur)) {
  5083. SKIP_BLANKS;
  5084. if (in->buf == NULL)
  5085. avail = in->length - (in->cur - in->base);
  5086. else
  5087. avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
  5088. (in->cur - in->base);
  5089. }
  5090. if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
  5091. ctxt->sax->setDocumentLocator(ctxt->userData,
  5092. &xmlDefaultSAXLocator);
  5093. if ((ctxt->sax) && (ctxt->sax->startDocument) &&
  5094. (!ctxt->disableSAX))
  5095. ctxt->sax->startDocument(ctxt->userData);
  5096. cur = in->cur[0];
  5097. next = in->cur[1];
  5098. if ((cur == '<') && (next == '!') &&
  5099. (UPP(2) == 'D') && (UPP(3) == 'O') &&
  5100. (UPP(4) == 'C') && (UPP(5) == 'T') &&
  5101. (UPP(6) == 'Y') && (UPP(7) == 'P') &&
  5102. (UPP(8) == 'E')) {
  5103. if ((!terminate) &&
  5104. (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
  5105. goto done;
  5106. #ifdef DEBUG_PUSH
  5107. xmlGenericError(xmlGenericErrorContext,
  5108. "HPP: Parsing internal subset\n");
  5109. #endif
  5110. htmlParseDocTypeDecl(ctxt);
  5111. ctxt->instate = XML_PARSER_PROLOG;
  5112. #ifdef DEBUG_PUSH
  5113. xmlGenericError(xmlGenericErrorContext,
  5114. "HPP: entering PROLOG\n");
  5115. #endif
  5116. } else {
  5117. ctxt->instate = XML_PARSER_MISC;
  5118. #ifdef DEBUG_PUSH
  5119. xmlGenericError(xmlGenericErrorContext,
  5120. "HPP: entering MISC\n");
  5121. #endif
  5122. }
  5123. break;
  5124. case XML_PARSER_MISC:
  5125. SKIP_BLANKS;
  5126. if (in->buf == NULL)
  5127. avail = in->length - (in->cur - in->base);
  5128. else
  5129. avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
  5130. (in->cur - in->base);
  5131. /*
  5132. * no chars in buffer
  5133. */
  5134. if (avail < 1)
  5135. goto done;
  5136. /*
  5137. * not enough chars in buffer
  5138. */
  5139. if (avail < 2) {
  5140. if (!terminate)
  5141. goto done;
  5142. else
  5143. next = ' ';
  5144. } else {
  5145. next = in->cur[1];
  5146. }
  5147. cur = in->cur[0];
  5148. if ((cur == '<') && (next == '!') &&
  5149. (in->cur[2] == '-') && (in->cur[3] == '-')) {
  5150. if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
  5151. goto done;
  5152. #ifdef DEBUG_PUSH
  5153. xmlGenericError(xmlGenericErrorContext,
  5154. "HPP: Parsing Comment\n");
  5155. #endif
  5156. htmlParseComment(ctxt);
  5157. ctxt->instate = XML_PARSER_MISC;
  5158. } else if ((cur == '<') && (next == '?')) {
  5159. if ((!terminate) &&
  5160. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
  5161. goto done;
  5162. #ifdef DEBUG_PUSH
  5163. xmlGenericError(xmlGenericErrorContext,
  5164. "HPP: Parsing PI\n");
  5165. #endif
  5166. htmlParsePI(ctxt);
  5167. ctxt->instate = XML_PARSER_MISC;
  5168. } else if ((cur == '<') && (next == '!') &&
  5169. (UPP(2) == 'D') && (UPP(3) == 'O') &&
  5170. (UPP(4) == 'C') && (UPP(5) == 'T') &&
  5171. (UPP(6) == 'Y') && (UPP(7) == 'P') &&
  5172. (UPP(8) == 'E')) {
  5173. if ((!terminate) &&
  5174. (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
  5175. goto done;
  5176. #ifdef DEBUG_PUSH
  5177. xmlGenericError(xmlGenericErrorContext,
  5178. "HPP: Parsing internal subset\n");
  5179. #endif
  5180. htmlParseDocTypeDecl(ctxt);
  5181. ctxt->instate = XML_PARSER_PROLOG;
  5182. #ifdef DEBUG_PUSH
  5183. xmlGenericError(xmlGenericErrorContext,
  5184. "HPP: entering PROLOG\n");
  5185. #endif
  5186. } else if ((cur == '<') && (next == '!') &&
  5187. (avail < 9)) {
  5188. goto done;
  5189. } else {
  5190. ctxt->instate = XML_PARSER_CONTENT;
  5191. #ifdef DEBUG_PUSH
  5192. xmlGenericError(xmlGenericErrorContext,
  5193. "HPP: entering START_TAG\n");
  5194. #endif
  5195. }
  5196. break;
  5197. case XML_PARSER_PROLOG:
  5198. SKIP_BLANKS;
  5199. if (in->buf == NULL)
  5200. avail = in->length - (in->cur - in->base);
  5201. else
  5202. avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
  5203. (in->cur - in->base);
  5204. if (avail < 2)
  5205. goto done;
  5206. cur = in->cur[0];
  5207. next = in->cur[1];
  5208. if ((cur == '<') && (next == '!') &&
  5209. (in->cur[2] == '-') && (in->cur[3] == '-')) {
  5210. if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
  5211. goto done;
  5212. #ifdef DEBUG_PUSH
  5213. xmlGenericError(xmlGenericErrorContext,
  5214. "HPP: Parsing Comment\n");
  5215. #endif
  5216. htmlParseComment(ctxt);
  5217. ctxt->instate = XML_PARSER_PROLOG;
  5218. } else if ((cur == '<') && (next == '?')) {
  5219. if ((!terminate) &&
  5220. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
  5221. goto done;
  5222. #ifdef DEBUG_PUSH
  5223. xmlGenericError(xmlGenericErrorContext,
  5224. "HPP: Parsing PI\n");
  5225. #endif
  5226. htmlParsePI(ctxt);
  5227. ctxt->instate = XML_PARSER_PROLOG;
  5228. } else if ((cur == '<') && (next == '!') &&
  5229. (avail < 4)) {
  5230. goto done;
  5231. } else {
  5232. ctxt->instate = XML_PARSER_CONTENT;
  5233. #ifdef DEBUG_PUSH
  5234. xmlGenericError(xmlGenericErrorContext,
  5235. "HPP: entering START_TAG\n");
  5236. #endif
  5237. }
  5238. break;
  5239. case XML_PARSER_EPILOG:
  5240. if (in->buf == NULL)
  5241. avail = in->length - (in->cur - in->base);
  5242. else
  5243. avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
  5244. (in->cur - in->base);
  5245. if (avail < 1)
  5246. goto done;
  5247. cur = in->cur[0];
  5248. if (IS_BLANK_CH(cur)) {
  5249. htmlParseCharData(ctxt);
  5250. goto done;
  5251. }
  5252. if (avail < 2)
  5253. goto done;
  5254. next = in->cur[1];
  5255. if ((cur == '<') && (next == '!') &&
  5256. (in->cur[2] == '-') && (in->cur[3] == '-')) {
  5257. if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
  5258. goto done;
  5259. #ifdef DEBUG_PUSH
  5260. xmlGenericError(xmlGenericErrorContext,
  5261. "HPP: Parsing Comment\n");
  5262. #endif
  5263. htmlParseComment(ctxt);
  5264. ctxt->instate = XML_PARSER_EPILOG;
  5265. } else if ((cur == '<') && (next == '?')) {
  5266. if ((!terminate) &&
  5267. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
  5268. goto done;
  5269. #ifdef DEBUG_PUSH
  5270. xmlGenericError(xmlGenericErrorContext,
  5271. "HPP: Parsing PI\n");
  5272. #endif
  5273. htmlParsePI(ctxt);
  5274. ctxt->instate = XML_PARSER_EPILOG;
  5275. } else if ((cur == '<') && (next == '!') &&
  5276. (avail < 4)) {
  5277. goto done;
  5278. } else {
  5279. ctxt->errNo = XML_ERR_DOCUMENT_END;
  5280. ctxt->wellFormed = 0;
  5281. ctxt->instate = XML_PARSER_EOF;
  5282. #ifdef DEBUG_PUSH
  5283. xmlGenericError(xmlGenericErrorContext,
  5284. "HPP: entering EOF\n");
  5285. #endif
  5286. if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
  5287. ctxt->sax->endDocument(ctxt->userData);
  5288. goto done;
  5289. }
  5290. break;
  5291. case XML_PARSER_START_TAG: {
  5292. const xmlChar *name;
  5293. int failed;
  5294. const htmlElemDesc * info;
  5295. /*
  5296. * no chars in buffer
  5297. */
  5298. if (avail < 1)
  5299. goto done;
  5300. /*
  5301. * not enough chars in buffer
  5302. */
  5303. if (avail < 2) {
  5304. if (!terminate)
  5305. goto done;
  5306. else
  5307. next = ' ';
  5308. } else {
  5309. next = in->cur[1];
  5310. }
  5311. cur = in->cur[0];
  5312. if (cur != '<') {
  5313. ctxt->instate = XML_PARSER_CONTENT;
  5314. #ifdef DEBUG_PUSH
  5315. xmlGenericError(xmlGenericErrorContext,
  5316. "HPP: entering CONTENT\n");
  5317. #endif
  5318. break;
  5319. }
  5320. if (next == '/') {
  5321. ctxt->instate = XML_PARSER_END_TAG;
  5322. ctxt->checkIndex = 0;
  5323. #ifdef DEBUG_PUSH
  5324. xmlGenericError(xmlGenericErrorContext,
  5325. "HPP: entering END_TAG\n");
  5326. #endif
  5327. break;
  5328. }
  5329. if ((!terminate) &&
  5330. (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
  5331. goto done;
  5332. /* Capture start position */
  5333. if (ctxt->record_info) {
  5334. node_info.begin_pos = ctxt->input->consumed +
  5335. (CUR_PTR - ctxt->input->base);
  5336. node_info.begin_line = ctxt->input->line;
  5337. }
  5338. failed = htmlParseStartTag(ctxt);
  5339. name = ctxt->name;
  5340. if ((failed == -1) ||
  5341. (name == NULL)) {
  5342. if (CUR == '>')
  5343. NEXT;
  5344. break;
  5345. }
  5346. /*
  5347. * Lookup the info for that element.
  5348. */
  5349. info = htmlTagLookup(name);
  5350. if (info == NULL) {
  5351. htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
  5352. "Tag %s invalid\n", name, NULL);
  5353. }
  5354. /*
  5355. * Check for an Empty Element labeled the XML/SGML way
  5356. */
  5357. if ((CUR == '/') && (NXT(1) == '>')) {
  5358. SKIP(2);
  5359. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  5360. ctxt->sax->endElement(ctxt->userData, name);
  5361. htmlnamePop(ctxt);
  5362. ctxt->instate = XML_PARSER_CONTENT;
  5363. #ifdef DEBUG_PUSH
  5364. xmlGenericError(xmlGenericErrorContext,
  5365. "HPP: entering CONTENT\n");
  5366. #endif
  5367. break;
  5368. }
  5369. if (CUR == '>') {
  5370. NEXT;
  5371. } else {
  5372. htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
  5373. "Couldn't find end of Start Tag %s\n",
  5374. name, NULL);
  5375. /*
  5376. * end of parsing of this node.
  5377. */
  5378. if (xmlStrEqual(name, ctxt->name)) {
  5379. nodePop(ctxt);
  5380. htmlnamePop(ctxt);
  5381. }
  5382. if (ctxt->record_info)
  5383. htmlNodeInfoPush(ctxt, &node_info);
  5384. ctxt->instate = XML_PARSER_CONTENT;
  5385. #ifdef DEBUG_PUSH
  5386. xmlGenericError(xmlGenericErrorContext,
  5387. "HPP: entering CONTENT\n");
  5388. #endif
  5389. break;
  5390. }
  5391. /*
  5392. * Check for an Empty Element from DTD definition
  5393. */
  5394. if ((info != NULL) && (info->empty)) {
  5395. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  5396. ctxt->sax->endElement(ctxt->userData, name);
  5397. htmlnamePop(ctxt);
  5398. }
  5399. if (ctxt->record_info)
  5400. htmlNodeInfoPush(ctxt, &node_info);
  5401. ctxt->instate = XML_PARSER_CONTENT;
  5402. #ifdef DEBUG_PUSH
  5403. xmlGenericError(xmlGenericErrorContext,
  5404. "HPP: entering CONTENT\n");
  5405. #endif
  5406. break;
  5407. }
  5408. case XML_PARSER_CONTENT: {
  5409. xmlChar chr[2] = { 0, 0 };
  5410. /*
  5411. * Handle preparsed entities and charRef
  5412. */
  5413. if (ctxt->token != 0) {
  5414. chr[0] = (xmlChar) ctxt->token;
  5415. htmlCheckParagraph(ctxt);
  5416. if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
  5417. ctxt->sax->characters(ctxt->userData, chr, 1);
  5418. ctxt->token = 0;
  5419. ctxt->checkIndex = 0;
  5420. }
  5421. if ((avail == 1) && (terminate)) {
  5422. cur = in->cur[0];
  5423. if ((cur != '<') && (cur != '&')) {
  5424. if (ctxt->sax != NULL) {
  5425. chr[0] = cur;
  5426. if (IS_BLANK_CH(cur)) {
  5427. if (ctxt->keepBlanks) {
  5428. if (ctxt->sax->characters != NULL)
  5429. ctxt->sax->characters(
  5430. ctxt->userData, chr, 1);
  5431. } else {
  5432. if (ctxt->sax->ignorableWhitespace != NULL)
  5433. ctxt->sax->ignorableWhitespace(
  5434. ctxt->userData, chr, 1);
  5435. }
  5436. } else {
  5437. htmlCheckParagraph(ctxt);
  5438. if (ctxt->sax->characters != NULL)
  5439. ctxt->sax->characters(
  5440. ctxt->userData, chr, 1);
  5441. }
  5442. }
  5443. ctxt->token = 0;
  5444. ctxt->checkIndex = 0;
  5445. in->cur++;
  5446. break;
  5447. }
  5448. }
  5449. if (avail < 2)
  5450. goto done;
  5451. cur = in->cur[0];
  5452. next = in->cur[1];
  5453. if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
  5454. (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
  5455. /*
  5456. * Handle SCRIPT/STYLE separately
  5457. */
  5458. if (!terminate) {
  5459. int idx;
  5460. xmlChar val;
  5461. idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
  5462. if (idx < 0)
  5463. goto done;
  5464. val = in->cur[idx + 2];
  5465. if (val == 0) /* bad cut of input */
  5466. goto done;
  5467. }
  5468. htmlParseScript(ctxt);
  5469. if ((cur == '<') && (next == '/')) {
  5470. ctxt->instate = XML_PARSER_END_TAG;
  5471. ctxt->checkIndex = 0;
  5472. #ifdef DEBUG_PUSH
  5473. xmlGenericError(xmlGenericErrorContext,
  5474. "HPP: entering END_TAG\n");
  5475. #endif
  5476. break;
  5477. }
  5478. } else {
  5479. /*
  5480. * Sometimes DOCTYPE arrives in the middle of the document
  5481. */
  5482. if ((cur == '<') && (next == '!') &&
  5483. (UPP(2) == 'D') && (UPP(3) == 'O') &&
  5484. (UPP(4) == 'C') && (UPP(5) == 'T') &&
  5485. (UPP(6) == 'Y') && (UPP(7) == 'P') &&
  5486. (UPP(8) == 'E')) {
  5487. if ((!terminate) &&
  5488. (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
  5489. goto done;
  5490. htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
  5491. "Misplaced DOCTYPE declaration\n",
  5492. BAD_CAST "DOCTYPE" , NULL);
  5493. htmlParseDocTypeDecl(ctxt);
  5494. } else if ((cur == '<') && (next == '!') &&
  5495. (in->cur[2] == '-') && (in->cur[3] == '-')) {
  5496. if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
  5497. goto done;
  5498. #ifdef DEBUG_PUSH
  5499. xmlGenericError(xmlGenericErrorContext,
  5500. "HPP: Parsing Comment\n");
  5501. #endif
  5502. htmlParseComment(ctxt);
  5503. ctxt->instate = XML_PARSER_CONTENT;
  5504. } else if ((cur == '<') && (next == '?')) {
  5505. if ((!terminate) &&
  5506. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
  5507. goto done;
  5508. #ifdef DEBUG_PUSH
  5509. xmlGenericError(xmlGenericErrorContext,
  5510. "HPP: Parsing PI\n");
  5511. #endif
  5512. htmlParsePI(ctxt);
  5513. ctxt->instate = XML_PARSER_CONTENT;
  5514. } else if ((cur == '<') && (next == '!') && (avail < 4)) {
  5515. goto done;
  5516. } else if ((cur == '<') && (next == '/')) {
  5517. ctxt->instate = XML_PARSER_END_TAG;
  5518. ctxt->checkIndex = 0;
  5519. #ifdef DEBUG_PUSH
  5520. xmlGenericError(xmlGenericErrorContext,
  5521. "HPP: entering END_TAG\n");
  5522. #endif
  5523. break;
  5524. } else if (cur == '<') {
  5525. if ((!terminate) && (next == 0))
  5526. goto done;
  5527. ctxt->instate = XML_PARSER_START_TAG;
  5528. ctxt->checkIndex = 0;
  5529. #ifdef DEBUG_PUSH
  5530. xmlGenericError(xmlGenericErrorContext,
  5531. "HPP: entering START_TAG\n");
  5532. #endif
  5533. break;
  5534. } else {
  5535. /*
  5536. * check that the text sequence is complete
  5537. * before handing out the data to the parser
  5538. * to avoid problems with erroneous end of
  5539. * data detection.
  5540. */
  5541. if ((!terminate) &&
  5542. (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
  5543. goto done;
  5544. ctxt->checkIndex = 0;
  5545. #ifdef DEBUG_PUSH
  5546. xmlGenericError(xmlGenericErrorContext,
  5547. "HPP: Parsing char data\n");
  5548. #endif
  5549. while ((ctxt->instate != XML_PARSER_EOF) &&
  5550. (cur != '<') && (in->cur < in->end)) {
  5551. if (cur == '&') {
  5552. htmlParseReference(ctxt);
  5553. } else {
  5554. htmlParseCharData(ctxt);
  5555. }
  5556. cur = in->cur[0];
  5557. }
  5558. }
  5559. }
  5560. break;
  5561. }
  5562. case XML_PARSER_END_TAG:
  5563. if (avail < 2)
  5564. goto done;
  5565. if ((!terminate) &&
  5566. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
  5567. goto done;
  5568. htmlParseEndTag(ctxt);
  5569. if (ctxt->nameNr == 0) {
  5570. ctxt->instate = XML_PARSER_EPILOG;
  5571. } else {
  5572. ctxt->instate = XML_PARSER_CONTENT;
  5573. }
  5574. ctxt->checkIndex = 0;
  5575. #ifdef DEBUG_PUSH
  5576. xmlGenericError(xmlGenericErrorContext,
  5577. "HPP: entering CONTENT\n");
  5578. #endif
  5579. break;
  5580. case XML_PARSER_CDATA_SECTION:
  5581. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5582. "HPP: internal error, state == CDATA\n",
  5583. NULL, NULL);
  5584. ctxt->instate = XML_PARSER_CONTENT;
  5585. ctxt->checkIndex = 0;
  5586. #ifdef DEBUG_PUSH
  5587. xmlGenericError(xmlGenericErrorContext,
  5588. "HPP: entering CONTENT\n");
  5589. #endif
  5590. break;
  5591. case XML_PARSER_DTD:
  5592. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5593. "HPP: internal error, state == DTD\n",
  5594. NULL, NULL);
  5595. ctxt->instate = XML_PARSER_CONTENT;
  5596. ctxt->checkIndex = 0;
  5597. #ifdef DEBUG_PUSH
  5598. xmlGenericError(xmlGenericErrorContext,
  5599. "HPP: entering CONTENT\n");
  5600. #endif
  5601. break;
  5602. case XML_PARSER_COMMENT:
  5603. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5604. "HPP: internal error, state == COMMENT\n",
  5605. NULL, NULL);
  5606. ctxt->instate = XML_PARSER_CONTENT;
  5607. ctxt->checkIndex = 0;
  5608. #ifdef DEBUG_PUSH
  5609. xmlGenericError(xmlGenericErrorContext,
  5610. "HPP: entering CONTENT\n");
  5611. #endif
  5612. break;
  5613. case XML_PARSER_PI:
  5614. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5615. "HPP: internal error, state == PI\n",
  5616. NULL, NULL);
  5617. ctxt->instate = XML_PARSER_CONTENT;
  5618. ctxt->checkIndex = 0;
  5619. #ifdef DEBUG_PUSH
  5620. xmlGenericError(xmlGenericErrorContext,
  5621. "HPP: entering CONTENT\n");
  5622. #endif
  5623. break;
  5624. case XML_PARSER_ENTITY_DECL:
  5625. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5626. "HPP: internal error, state == ENTITY_DECL\n",
  5627. NULL, NULL);
  5628. ctxt->instate = XML_PARSER_CONTENT;
  5629. ctxt->checkIndex = 0;
  5630. #ifdef DEBUG_PUSH
  5631. xmlGenericError(xmlGenericErrorContext,
  5632. "HPP: entering CONTENT\n");
  5633. #endif
  5634. break;
  5635. case XML_PARSER_ENTITY_VALUE:
  5636. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5637. "HPP: internal error, state == ENTITY_VALUE\n",
  5638. NULL, NULL);
  5639. ctxt->instate = XML_PARSER_CONTENT;
  5640. ctxt->checkIndex = 0;
  5641. #ifdef DEBUG_PUSH
  5642. xmlGenericError(xmlGenericErrorContext,
  5643. "HPP: entering DTD\n");
  5644. #endif
  5645. break;
  5646. case XML_PARSER_ATTRIBUTE_VALUE:
  5647. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5648. "HPP: internal error, state == ATTRIBUTE_VALUE\n",
  5649. NULL, NULL);
  5650. ctxt->instate = XML_PARSER_START_TAG;
  5651. ctxt->checkIndex = 0;
  5652. #ifdef DEBUG_PUSH
  5653. xmlGenericError(xmlGenericErrorContext,
  5654. "HPP: entering START_TAG\n");
  5655. #endif
  5656. break;
  5657. case XML_PARSER_SYSTEM_LITERAL:
  5658. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5659. "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
  5660. NULL, NULL);
  5661. ctxt->instate = XML_PARSER_CONTENT;
  5662. ctxt->checkIndex = 0;
  5663. #ifdef DEBUG_PUSH
  5664. xmlGenericError(xmlGenericErrorContext,
  5665. "HPP: entering CONTENT\n");
  5666. #endif
  5667. break;
  5668. case XML_PARSER_IGNORE:
  5669. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5670. "HPP: internal error, state == XML_PARSER_IGNORE\n",
  5671. NULL, NULL);
  5672. ctxt->instate = XML_PARSER_CONTENT;
  5673. ctxt->checkIndex = 0;
  5674. #ifdef DEBUG_PUSH
  5675. xmlGenericError(xmlGenericErrorContext,
  5676. "HPP: entering CONTENT\n");
  5677. #endif
  5678. break;
  5679. case XML_PARSER_PUBLIC_LITERAL:
  5680. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5681. "HPP: internal error, state == XML_PARSER_LITERAL\n",
  5682. NULL, NULL);
  5683. ctxt->instate = XML_PARSER_CONTENT;
  5684. ctxt->checkIndex = 0;
  5685. #ifdef DEBUG_PUSH
  5686. xmlGenericError(xmlGenericErrorContext,
  5687. "HPP: entering CONTENT\n");
  5688. #endif
  5689. break;
  5690. }
  5691. }
  5692. done:
  5693. if ((avail == 0) && (terminate)) {
  5694. htmlAutoCloseOnEnd(ctxt);
  5695. if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
  5696. /*
  5697. * SAX: end of the document processing.
  5698. */
  5699. ctxt->instate = XML_PARSER_EOF;
  5700. if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
  5701. ctxt->sax->endDocument(ctxt->userData);
  5702. }
  5703. }
  5704. if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
  5705. ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
  5706. (ctxt->instate == XML_PARSER_EPILOG))) {
  5707. xmlDtdPtr dtd;
  5708. dtd = xmlGetIntSubset(ctxt->myDoc);
  5709. if (dtd == NULL)
  5710. ctxt->myDoc->intSubset =
  5711. xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
  5712. BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
  5713. BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
  5714. }
  5715. #ifdef DEBUG_PUSH
  5716. xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
  5717. #endif
  5718. return(ret);
  5719. }
  5720. /**
  5721. * htmlParseChunk:
  5722. * @ctxt: an HTML parser context
  5723. * @chunk: an char array
  5724. * @size: the size in byte of the chunk
  5725. * @terminate: last chunk indicator
  5726. *
  5727. * Parse a Chunk of memory
  5728. *
  5729. * Returns zero if no error, the xmlParserErrors otherwise.
  5730. */
  5731. int
  5732. htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
  5733. int terminate) {
  5734. if ((ctxt == NULL) || (ctxt->input == NULL)) {
  5735. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5736. "htmlParseChunk: context error\n", NULL, NULL);
  5737. return(XML_ERR_INTERNAL_ERROR);
  5738. }
  5739. if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
  5740. (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
  5741. size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
  5742. size_t cur = ctxt->input->cur - ctxt->input->base;
  5743. int res;
  5744. res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
  5745. xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
  5746. if (res < 0) {
  5747. ctxt->errNo = XML_PARSER_EOF;
  5748. ctxt->disableSAX = 1;
  5749. return (XML_PARSER_EOF);
  5750. }
  5751. #ifdef DEBUG_PUSH
  5752. xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
  5753. #endif
  5754. #if 0
  5755. if ((terminate) || (ctxt->input->buf->buffer->use > 80))
  5756. htmlParseTryOrFinish(ctxt, terminate);
  5757. #endif
  5758. } else if (ctxt->instate != XML_PARSER_EOF) {
  5759. if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
  5760. xmlParserInputBufferPtr in = ctxt->input->buf;
  5761. if ((in->encoder != NULL) && (in->buffer != NULL) &&
  5762. (in->raw != NULL)) {
  5763. int nbchars;
  5764. size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
  5765. size_t current = ctxt->input->cur - ctxt->input->base;
  5766. nbchars = xmlCharEncInput(in, terminate);
  5767. xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
  5768. if (nbchars < 0) {
  5769. htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
  5770. "encoder error\n", NULL, NULL);
  5771. return(XML_ERR_INVALID_ENCODING);
  5772. }
  5773. }
  5774. }
  5775. }
  5776. htmlParseTryOrFinish(ctxt, terminate);
  5777. if (terminate) {
  5778. if ((ctxt->instate != XML_PARSER_EOF) &&
  5779. (ctxt->instate != XML_PARSER_EPILOG) &&
  5780. (ctxt->instate != XML_PARSER_MISC)) {
  5781. ctxt->errNo = XML_ERR_DOCUMENT_END;
  5782. ctxt->wellFormed = 0;
  5783. }
  5784. if (ctxt->instate != XML_PARSER_EOF) {
  5785. if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
  5786. ctxt->sax->endDocument(ctxt->userData);
  5787. }
  5788. ctxt->instate = XML_PARSER_EOF;
  5789. }
  5790. return((xmlParserErrors) ctxt->errNo);
  5791. }
  5792. /************************************************************************
  5793. * *
  5794. * User entry points *
  5795. * *
  5796. ************************************************************************/
  5797. /**
  5798. * htmlCreatePushParserCtxt:
  5799. * @sax: a SAX handler
  5800. * @user_data: The user data returned on SAX callbacks
  5801. * @chunk: a pointer to an array of chars
  5802. * @size: number of chars in the array
  5803. * @filename: an optional file name or URI
  5804. * @enc: an optional encoding
  5805. *
  5806. * Create a parser context for using the HTML parser in push mode
  5807. * The value of @filename is used for fetching external entities
  5808. * and error/warning reports.
  5809. *
  5810. * Returns the new parser context or NULL
  5811. */
  5812. htmlParserCtxtPtr
  5813. htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
  5814. const char *chunk, int size, const char *filename,
  5815. xmlCharEncoding enc) {
  5816. htmlParserCtxtPtr ctxt;
  5817. htmlParserInputPtr inputStream;
  5818. xmlParserInputBufferPtr buf;
  5819. xmlInitParser();
  5820. buf = xmlAllocParserInputBuffer(enc);
  5821. if (buf == NULL) return(NULL);
  5822. ctxt = htmlNewParserCtxt();
  5823. if (ctxt == NULL) {
  5824. xmlFreeParserInputBuffer(buf);
  5825. return(NULL);
  5826. }
  5827. if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
  5828. ctxt->charset=XML_CHAR_ENCODING_UTF8;
  5829. if (sax != NULL) {
  5830. if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
  5831. xmlFree(ctxt->sax);
  5832. ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
  5833. if (ctxt->sax == NULL) {
  5834. xmlFree(buf);
  5835. xmlFree(ctxt);
  5836. return(NULL);
  5837. }
  5838. memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
  5839. if (user_data != NULL)
  5840. ctxt->userData = user_data;
  5841. }
  5842. if (filename == NULL) {
  5843. ctxt->directory = NULL;
  5844. } else {
  5845. ctxt->directory = xmlParserGetDirectory(filename);
  5846. }
  5847. inputStream = htmlNewInputStream(ctxt);
  5848. if (inputStream == NULL) {
  5849. xmlFreeParserCtxt(ctxt);
  5850. xmlFree(buf);
  5851. return(NULL);
  5852. }
  5853. if (filename == NULL)
  5854. inputStream->filename = NULL;
  5855. else
  5856. inputStream->filename = (char *)
  5857. xmlCanonicPath((const xmlChar *) filename);
  5858. inputStream->buf = buf;
  5859. xmlBufResetInput(buf->buffer, inputStream);
  5860. inputPush(ctxt, inputStream);
  5861. if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
  5862. (ctxt->input->buf != NULL)) {
  5863. size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
  5864. size_t cur = ctxt->input->cur - ctxt->input->base;
  5865. xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
  5866. xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
  5867. #ifdef DEBUG_PUSH
  5868. xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
  5869. #endif
  5870. }
  5871. ctxt->progressive = 1;
  5872. return(ctxt);
  5873. }
  5874. #endif /* LIBXML_PUSH_ENABLED */
  5875. /**
  5876. * htmlSAXParseDoc:
  5877. * @cur: a pointer to an array of xmlChar
  5878. * @encoding: a free form C string describing the HTML document encoding, or NULL
  5879. * @sax: the SAX handler block
  5880. * @userData: if using SAX, this pointer will be provided on callbacks.
  5881. *
  5882. * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
  5883. * to handle parse events. If sax is NULL, fallback to the default DOM
  5884. * behavior and return a tree.
  5885. *
  5886. * Returns the resulting document tree unless SAX is NULL or the document is
  5887. * not well formed.
  5888. */
  5889. htmlDocPtr
  5890. htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
  5891. htmlSAXHandlerPtr sax, void *userData) {
  5892. htmlDocPtr ret;
  5893. htmlParserCtxtPtr ctxt;
  5894. xmlInitParser();
  5895. if (cur == NULL) return(NULL);
  5896. ctxt = htmlCreateDocParserCtxt(cur, encoding);
  5897. if (ctxt == NULL) return(NULL);
  5898. if (sax != NULL) {
  5899. if (ctxt->sax != NULL) xmlFree (ctxt->sax);
  5900. ctxt->sax = sax;
  5901. ctxt->userData = userData;
  5902. }
  5903. htmlParseDocument(ctxt);
  5904. ret = ctxt->myDoc;
  5905. if (sax != NULL) {
  5906. ctxt->sax = NULL;
  5907. ctxt->userData = NULL;
  5908. }
  5909. htmlFreeParserCtxt(ctxt);
  5910. return(ret);
  5911. }
  5912. /**
  5913. * htmlParseDoc:
  5914. * @cur: a pointer to an array of xmlChar
  5915. * @encoding: a free form C string describing the HTML document encoding, or NULL
  5916. *
  5917. * parse an HTML in-memory document and build a tree.
  5918. *
  5919. * Returns the resulting document tree
  5920. */
  5921. htmlDocPtr
  5922. htmlParseDoc(const xmlChar *cur, const char *encoding) {
  5923. return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
  5924. }
  5925. /**
  5926. * htmlCreateFileParserCtxt:
  5927. * @filename: the filename
  5928. * @encoding: a free form C string describing the HTML document encoding, or NULL
  5929. *
  5930. * Create a parser context for a file content.
  5931. * Automatic support for ZLIB/Compress compressed document is provided
  5932. * by default if found at compile-time.
  5933. *
  5934. * Returns the new parser context or NULL
  5935. */
  5936. htmlParserCtxtPtr
  5937. htmlCreateFileParserCtxt(const char *filename, const char *encoding)
  5938. {
  5939. htmlParserCtxtPtr ctxt;
  5940. htmlParserInputPtr inputStream;
  5941. char *canonicFilename;
  5942. /* htmlCharEncoding enc; */
  5943. xmlChar *content, *content_line = (xmlChar *) "charset=";
  5944. if (filename == NULL)
  5945. return(NULL);
  5946. ctxt = htmlNewParserCtxt();
  5947. if (ctxt == NULL) {
  5948. return(NULL);
  5949. }
  5950. canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
  5951. if (canonicFilename == NULL) {
  5952. #ifdef LIBXML_SAX1_ENABLED
  5953. if (xmlDefaultSAXHandler.error != NULL) {
  5954. xmlDefaultSAXHandler.error(NULL, "out of memory\n");
  5955. }
  5956. #endif
  5957. xmlFreeParserCtxt(ctxt);
  5958. return(NULL);
  5959. }
  5960. inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
  5961. xmlFree(canonicFilename);
  5962. if (inputStream == NULL) {
  5963. xmlFreeParserCtxt(ctxt);
  5964. return(NULL);
  5965. }
  5966. inputPush(ctxt, inputStream);
  5967. /* set encoding */
  5968. if (encoding) {
  5969. size_t l = strlen(encoding);
  5970. if (l < 1000) {
  5971. content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
  5972. if (content) {
  5973. strcpy ((char *)content, (char *)content_line);
  5974. strcat ((char *)content, (char *)encoding);
  5975. htmlCheckEncoding (ctxt, content);
  5976. xmlFree (content);
  5977. }
  5978. }
  5979. }
  5980. return(ctxt);
  5981. }
  5982. /**
  5983. * htmlSAXParseFile:
  5984. * @filename: the filename
  5985. * @encoding: a free form C string describing the HTML document encoding, or NULL
  5986. * @sax: the SAX handler block
  5987. * @userData: if using SAX, this pointer will be provided on callbacks.
  5988. *
  5989. * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
  5990. * compressed document is provided by default if found at compile-time.
  5991. * It use the given SAX function block to handle the parsing callback.
  5992. * If sax is NULL, fallback to the default DOM tree building routines.
  5993. *
  5994. * Returns the resulting document tree unless SAX is NULL or the document is
  5995. * not well formed.
  5996. */
  5997. htmlDocPtr
  5998. htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
  5999. void *userData) {
  6000. htmlDocPtr ret;
  6001. htmlParserCtxtPtr ctxt;
  6002. htmlSAXHandlerPtr oldsax = NULL;
  6003. xmlInitParser();
  6004. ctxt = htmlCreateFileParserCtxt(filename, encoding);
  6005. if (ctxt == NULL) return(NULL);
  6006. if (sax != NULL) {
  6007. oldsax = ctxt->sax;
  6008. ctxt->sax = sax;
  6009. ctxt->userData = userData;
  6010. }
  6011. htmlParseDocument(ctxt);
  6012. ret = ctxt->myDoc;
  6013. if (sax != NULL) {
  6014. ctxt->sax = oldsax;
  6015. ctxt->userData = NULL;
  6016. }
  6017. htmlFreeParserCtxt(ctxt);
  6018. return(ret);
  6019. }
  6020. /**
  6021. * htmlParseFile:
  6022. * @filename: the filename
  6023. * @encoding: a free form C string describing the HTML document encoding, or NULL
  6024. *
  6025. * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
  6026. * compressed document is provided by default if found at compile-time.
  6027. *
  6028. * Returns the resulting document tree
  6029. */
  6030. htmlDocPtr
  6031. htmlParseFile(const char *filename, const char *encoding) {
  6032. return(htmlSAXParseFile(filename, encoding, NULL, NULL));
  6033. }
  6034. /**
  6035. * htmlHandleOmittedElem:
  6036. * @val: int 0 or 1
  6037. *
  6038. * Set and return the previous value for handling HTML omitted tags.
  6039. *
  6040. * Returns the last value for 0 for no handling, 1 for auto insertion.
  6041. */
  6042. int
  6043. htmlHandleOmittedElem(int val) {
  6044. int old = htmlOmittedDefaultValue;
  6045. htmlOmittedDefaultValue = val;
  6046. return(old);
  6047. }
  6048. /**
  6049. * htmlElementAllowedHere:
  6050. * @parent: HTML parent element
  6051. * @elt: HTML element
  6052. *
  6053. * Checks whether an HTML element may be a direct child of a parent element.
  6054. * Note - doesn't check for deprecated elements
  6055. *
  6056. * Returns 1 if allowed; 0 otherwise.
  6057. */
  6058. int
  6059. htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
  6060. const char** p ;
  6061. if ( ! elt || ! parent || ! parent->subelts )
  6062. return 0 ;
  6063. for ( p = parent->subelts; *p; ++p )
  6064. if ( !xmlStrcmp((const xmlChar *)*p, elt) )
  6065. return 1 ;
  6066. return 0 ;
  6067. }
  6068. /**
  6069. * htmlElementStatusHere:
  6070. * @parent: HTML parent element
  6071. * @elt: HTML element
  6072. *
  6073. * Checks whether an HTML element may be a direct child of a parent element.
  6074. * and if so whether it is valid or deprecated.
  6075. *
  6076. * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
  6077. */
  6078. htmlStatus
  6079. htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
  6080. if ( ! parent || ! elt )
  6081. return HTML_INVALID ;
  6082. if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
  6083. return HTML_INVALID ;
  6084. return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
  6085. }
  6086. /**
  6087. * htmlAttrAllowed:
  6088. * @elt: HTML element
  6089. * @attr: HTML attribute
  6090. * @legacy: whether to allow deprecated attributes
  6091. *
  6092. * Checks whether an attribute is valid for an element
  6093. * Has full knowledge of Required and Deprecated attributes
  6094. *
  6095. * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
  6096. */
  6097. htmlStatus
  6098. htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
  6099. const char** p ;
  6100. if ( !elt || ! attr )
  6101. return HTML_INVALID ;
  6102. if ( elt->attrs_req )
  6103. for ( p = elt->attrs_req; *p; ++p)
  6104. if ( !xmlStrcmp((const xmlChar*)*p, attr) )
  6105. return HTML_REQUIRED ;
  6106. if ( elt->attrs_opt )
  6107. for ( p = elt->attrs_opt; *p; ++p)
  6108. if ( !xmlStrcmp((const xmlChar*)*p, attr) )
  6109. return HTML_VALID ;
  6110. if ( legacy && elt->attrs_depr )
  6111. for ( p = elt->attrs_depr; *p; ++p)
  6112. if ( !xmlStrcmp((const xmlChar*)*p, attr) )
  6113. return HTML_DEPRECATED ;
  6114. return HTML_INVALID ;
  6115. }
  6116. /**
  6117. * htmlNodeStatus:
  6118. * @node: an htmlNodePtr in a tree
  6119. * @legacy: whether to allow deprecated elements (YES is faster here
  6120. * for Element nodes)
  6121. *
  6122. * Checks whether the tree node is valid. Experimental (the author
  6123. * only uses the HTML enhancements in a SAX parser)
  6124. *
  6125. * Return: for Element nodes, a return from htmlElementAllowedHere (if
  6126. * legacy allowed) or htmlElementStatusHere (otherwise).
  6127. * for Attribute nodes, a return from htmlAttrAllowed
  6128. * for other nodes, HTML_NA (no checks performed)
  6129. */
  6130. htmlStatus
  6131. htmlNodeStatus(const htmlNodePtr node, int legacy) {
  6132. if ( ! node )
  6133. return HTML_INVALID ;
  6134. switch ( node->type ) {
  6135. case XML_ELEMENT_NODE:
  6136. return legacy
  6137. ? ( htmlElementAllowedHere (
  6138. htmlTagLookup(node->parent->name) , node->name
  6139. ) ? HTML_VALID : HTML_INVALID )
  6140. : htmlElementStatusHere(
  6141. htmlTagLookup(node->parent->name) ,
  6142. htmlTagLookup(node->name) )
  6143. ;
  6144. case XML_ATTRIBUTE_NODE:
  6145. return htmlAttrAllowed(
  6146. htmlTagLookup(node->parent->name) , node->name, legacy) ;
  6147. default: return HTML_NA ;
  6148. }
  6149. }
  6150. /************************************************************************
  6151. * *
  6152. * New set (2.6.0) of simpler and more flexible APIs *
  6153. * *
  6154. ************************************************************************/
  6155. /**
  6156. * DICT_FREE:
  6157. * @str: a string
  6158. *
  6159. * Free a string if it is not owned by the "dict" dictionary in the
  6160. * current scope
  6161. */
  6162. #define DICT_FREE(str) \
  6163. if ((str) && ((!dict) || \
  6164. (xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \
  6165. xmlFree((char *)(str));
  6166. /**
  6167. * htmlCtxtReset:
  6168. * @ctxt: an HTML parser context
  6169. *
  6170. * Reset a parser context
  6171. */
  6172. void
  6173. htmlCtxtReset(htmlParserCtxtPtr ctxt)
  6174. {
  6175. xmlParserInputPtr input;
  6176. xmlDictPtr dict;
  6177. if (ctxt == NULL)
  6178. return;
  6179. xmlInitParser();
  6180. dict = ctxt->dict;
  6181. while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
  6182. xmlFreeInputStream(input);
  6183. }
  6184. ctxt->inputNr = 0;
  6185. ctxt->input = NULL;
  6186. ctxt->spaceNr = 0;
  6187. if (ctxt->spaceTab != NULL) {
  6188. ctxt->spaceTab[0] = -1;
  6189. ctxt->space = &ctxt->spaceTab[0];
  6190. } else {
  6191. ctxt->space = NULL;
  6192. }
  6193. ctxt->nodeNr = 0;
  6194. ctxt->node = NULL;
  6195. ctxt->nameNr = 0;
  6196. ctxt->name = NULL;
  6197. DICT_FREE(ctxt->version);
  6198. ctxt->version = NULL;
  6199. DICT_FREE(ctxt->encoding);
  6200. ctxt->encoding = NULL;
  6201. DICT_FREE(ctxt->directory);
  6202. ctxt->directory = NULL;
  6203. DICT_FREE(ctxt->extSubURI);
  6204. ctxt->extSubURI = NULL;
  6205. DICT_FREE(ctxt->extSubSystem);
  6206. ctxt->extSubSystem = NULL;
  6207. if (ctxt->myDoc != NULL)
  6208. xmlFreeDoc(ctxt->myDoc);
  6209. ctxt->myDoc = NULL;
  6210. ctxt->standalone = -1;
  6211. ctxt->hasExternalSubset = 0;
  6212. ctxt->hasPErefs = 0;
  6213. ctxt->html = 1;
  6214. ctxt->external = 0;
  6215. ctxt->instate = XML_PARSER_START;
  6216. ctxt->token = 0;
  6217. ctxt->wellFormed = 1;
  6218. ctxt->nsWellFormed = 1;
  6219. ctxt->disableSAX = 0;
  6220. ctxt->valid = 1;
  6221. ctxt->vctxt.userData = ctxt;
  6222. ctxt->vctxt.error = xmlParserValidityError;
  6223. ctxt->vctxt.warning = xmlParserValidityWarning;
  6224. ctxt->record_info = 0;
  6225. ctxt->checkIndex = 0;
  6226. ctxt->inSubset = 0;
  6227. ctxt->errNo = XML_ERR_OK;
  6228. ctxt->depth = 0;
  6229. ctxt->charset = XML_CHAR_ENCODING_NONE;
  6230. ctxt->catalogs = NULL;
  6231. xmlInitNodeInfoSeq(&ctxt->node_seq);
  6232. if (ctxt->attsDefault != NULL) {
  6233. xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
  6234. ctxt->attsDefault = NULL;
  6235. }
  6236. if (ctxt->attsSpecial != NULL) {
  6237. xmlHashFree(ctxt->attsSpecial, NULL);
  6238. ctxt->attsSpecial = NULL;
  6239. }
  6240. }
  6241. /**
  6242. * htmlCtxtUseOptions:
  6243. * @ctxt: an HTML parser context
  6244. * @options: a combination of htmlParserOption(s)
  6245. *
  6246. * Applies the options to the parser context
  6247. *
  6248. * Returns 0 in case of success, the set of unknown or unimplemented options
  6249. * in case of error.
  6250. */
  6251. int
  6252. htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
  6253. {
  6254. if (ctxt == NULL)
  6255. return(-1);
  6256. if (options & HTML_PARSE_NOWARNING) {
  6257. ctxt->sax->warning = NULL;
  6258. ctxt->vctxt.warning = NULL;
  6259. options -= XML_PARSE_NOWARNING;
  6260. ctxt->options |= XML_PARSE_NOWARNING;
  6261. }
  6262. if (options & HTML_PARSE_NOERROR) {
  6263. ctxt->sax->error = NULL;
  6264. ctxt->vctxt.error = NULL;
  6265. ctxt->sax->fatalError = NULL;
  6266. options -= XML_PARSE_NOERROR;
  6267. ctxt->options |= XML_PARSE_NOERROR;
  6268. }
  6269. if (options & HTML_PARSE_PEDANTIC) {
  6270. ctxt->pedantic = 1;
  6271. options -= XML_PARSE_PEDANTIC;
  6272. ctxt->options |= XML_PARSE_PEDANTIC;
  6273. } else
  6274. ctxt->pedantic = 0;
  6275. if (options & XML_PARSE_NOBLANKS) {
  6276. ctxt->keepBlanks = 0;
  6277. ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
  6278. options -= XML_PARSE_NOBLANKS;
  6279. ctxt->options |= XML_PARSE_NOBLANKS;
  6280. } else
  6281. ctxt->keepBlanks = 1;
  6282. if (options & HTML_PARSE_RECOVER) {
  6283. ctxt->recovery = 1;
  6284. options -= HTML_PARSE_RECOVER;
  6285. } else
  6286. ctxt->recovery = 0;
  6287. if (options & HTML_PARSE_COMPACT) {
  6288. ctxt->options |= HTML_PARSE_COMPACT;
  6289. options -= HTML_PARSE_COMPACT;
  6290. }
  6291. if (options & XML_PARSE_HUGE) {
  6292. ctxt->options |= XML_PARSE_HUGE;
  6293. options -= XML_PARSE_HUGE;
  6294. }
  6295. if (options & HTML_PARSE_NODEFDTD) {
  6296. ctxt->options |= HTML_PARSE_NODEFDTD;
  6297. options -= HTML_PARSE_NODEFDTD;
  6298. }
  6299. if (options & HTML_PARSE_IGNORE_ENC) {
  6300. ctxt->options |= HTML_PARSE_IGNORE_ENC;
  6301. options -= HTML_PARSE_IGNORE_ENC;
  6302. }
  6303. if (options & HTML_PARSE_NOIMPLIED) {
  6304. ctxt->options |= HTML_PARSE_NOIMPLIED;
  6305. options -= HTML_PARSE_NOIMPLIED;
  6306. }
  6307. ctxt->dictNames = 0;
  6308. return (options);
  6309. }
  6310. /**
  6311. * htmlDoRead:
  6312. * @ctxt: an HTML parser context
  6313. * @URL: the base URL to use for the document
  6314. * @encoding: the document encoding, or NULL
  6315. * @options: a combination of htmlParserOption(s)
  6316. * @reuse: keep the context for reuse
  6317. *
  6318. * Common front-end for the htmlRead functions
  6319. *
  6320. * Returns the resulting document tree or NULL
  6321. */
  6322. static htmlDocPtr
  6323. htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
  6324. int options, int reuse)
  6325. {
  6326. htmlDocPtr ret;
  6327. htmlCtxtUseOptions(ctxt, options);
  6328. ctxt->html = 1;
  6329. if (encoding != NULL) {
  6330. xmlCharEncodingHandlerPtr hdlr;
  6331. hdlr = xmlFindCharEncodingHandler(encoding);
  6332. if (hdlr != NULL) {
  6333. xmlSwitchToEncoding(ctxt, hdlr);
  6334. if (ctxt->input->encoding != NULL)
  6335. xmlFree((xmlChar *) ctxt->input->encoding);
  6336. ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
  6337. }
  6338. }
  6339. if ((URL != NULL) && (ctxt->input != NULL) &&
  6340. (ctxt->input->filename == NULL))
  6341. ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
  6342. htmlParseDocument(ctxt);
  6343. ret = ctxt->myDoc;
  6344. ctxt->myDoc = NULL;
  6345. if (!reuse) {
  6346. if ((ctxt->dictNames) &&
  6347. (ret != NULL) &&
  6348. (ret->dict == ctxt->dict))
  6349. ctxt->dict = NULL;
  6350. xmlFreeParserCtxt(ctxt);
  6351. }
  6352. return (ret);
  6353. }
  6354. /**
  6355. * htmlReadDoc:
  6356. * @cur: a pointer to a zero terminated string
  6357. * @URL: the base URL to use for the document
  6358. * @encoding: the document encoding, or NULL
  6359. * @options: a combination of htmlParserOption(s)
  6360. *
  6361. * parse an XML in-memory document and build a tree.
  6362. *
  6363. * Returns the resulting document tree
  6364. */
  6365. htmlDocPtr
  6366. htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
  6367. {
  6368. htmlParserCtxtPtr ctxt;
  6369. if (cur == NULL)
  6370. return (NULL);
  6371. xmlInitParser();
  6372. ctxt = htmlCreateDocParserCtxt(cur, NULL);
  6373. if (ctxt == NULL)
  6374. return (NULL);
  6375. return (htmlDoRead(ctxt, URL, encoding, options, 0));
  6376. }
  6377. /**
  6378. * htmlReadFile:
  6379. * @filename: a file or URL
  6380. * @encoding: the document encoding, or NULL
  6381. * @options: a combination of htmlParserOption(s)
  6382. *
  6383. * parse an XML file from the filesystem or the network.
  6384. *
  6385. * Returns the resulting document tree
  6386. */
  6387. htmlDocPtr
  6388. htmlReadFile(const char *filename, const char *encoding, int options)
  6389. {
  6390. htmlParserCtxtPtr ctxt;
  6391. xmlInitParser();
  6392. ctxt = htmlCreateFileParserCtxt(filename, encoding);
  6393. if (ctxt == NULL)
  6394. return (NULL);
  6395. return (htmlDoRead(ctxt, NULL, NULL, options, 0));
  6396. }
  6397. /**
  6398. * htmlReadMemory:
  6399. * @buffer: a pointer to a char array
  6400. * @size: the size of the array
  6401. * @URL: the base URL to use for the document
  6402. * @encoding: the document encoding, or NULL
  6403. * @options: a combination of htmlParserOption(s)
  6404. *
  6405. * parse an XML in-memory document and build a tree.
  6406. *
  6407. * Returns the resulting document tree
  6408. */
  6409. htmlDocPtr
  6410. htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
  6411. {
  6412. htmlParserCtxtPtr ctxt;
  6413. xmlInitParser();
  6414. ctxt = xmlCreateMemoryParserCtxt(buffer, size);
  6415. if (ctxt == NULL)
  6416. return (NULL);
  6417. htmlDefaultSAXHandlerInit();
  6418. if (ctxt->sax != NULL)
  6419. memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
  6420. return (htmlDoRead(ctxt, URL, encoding, options, 0));
  6421. }
  6422. /**
  6423. * htmlReadFd:
  6424. * @fd: an open file descriptor
  6425. * @URL: the base URL to use for the document
  6426. * @encoding: the document encoding, or NULL
  6427. * @options: a combination of htmlParserOption(s)
  6428. *
  6429. * parse an HTML from a file descriptor and build a tree.
  6430. * NOTE that the file descriptor will not be closed when the
  6431. * reader is closed or reset.
  6432. *
  6433. * Returns the resulting document tree
  6434. */
  6435. htmlDocPtr
  6436. htmlReadFd(int fd, const char *URL, const char *encoding, int options)
  6437. {
  6438. htmlParserCtxtPtr ctxt;
  6439. xmlParserInputBufferPtr input;
  6440. htmlParserInputPtr stream;
  6441. if (fd < 0)
  6442. return (NULL);
  6443. xmlInitParser();
  6444. input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
  6445. if (input == NULL)
  6446. return (NULL);
  6447. input->closecallback = NULL;
  6448. ctxt = htmlNewParserCtxt();
  6449. if (ctxt == NULL) {
  6450. xmlFreeParserInputBuffer(input);
  6451. return (NULL);
  6452. }
  6453. stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
  6454. if (stream == NULL) {
  6455. xmlFreeParserInputBuffer(input);
  6456. htmlFreeParserCtxt(ctxt);
  6457. return (NULL);
  6458. }
  6459. inputPush(ctxt, stream);
  6460. return (htmlDoRead(ctxt, URL, encoding, options, 0));
  6461. }
  6462. /**
  6463. * htmlReadIO:
  6464. * @ioread: an I/O read function
  6465. * @ioclose: an I/O close function
  6466. * @ioctx: an I/O handler
  6467. * @URL: the base URL to use for the document
  6468. * @encoding: the document encoding, or NULL
  6469. * @options: a combination of htmlParserOption(s)
  6470. *
  6471. * parse an HTML document from I/O functions and source and build a tree.
  6472. *
  6473. * Returns the resulting document tree
  6474. */
  6475. htmlDocPtr
  6476. htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
  6477. void *ioctx, const char *URL, const char *encoding, int options)
  6478. {
  6479. htmlParserCtxtPtr ctxt;
  6480. xmlParserInputBufferPtr input;
  6481. xmlParserInputPtr stream;
  6482. if (ioread == NULL)
  6483. return (NULL);
  6484. xmlInitParser();
  6485. input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
  6486. XML_CHAR_ENCODING_NONE);
  6487. if (input == NULL) {
  6488. if (ioclose != NULL)
  6489. ioclose(ioctx);
  6490. return (NULL);
  6491. }
  6492. ctxt = htmlNewParserCtxt();
  6493. if (ctxt == NULL) {
  6494. xmlFreeParserInputBuffer(input);
  6495. return (NULL);
  6496. }
  6497. stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
  6498. if (stream == NULL) {
  6499. xmlFreeParserInputBuffer(input);
  6500. xmlFreeParserCtxt(ctxt);
  6501. return (NULL);
  6502. }
  6503. inputPush(ctxt, stream);
  6504. return (htmlDoRead(ctxt, URL, encoding, options, 0));
  6505. }
  6506. /**
  6507. * htmlCtxtReadDoc:
  6508. * @ctxt: an HTML parser context
  6509. * @cur: a pointer to a zero terminated string
  6510. * @URL: the base URL to use for the document
  6511. * @encoding: the document encoding, or NULL
  6512. * @options: a combination of htmlParserOption(s)
  6513. *
  6514. * parse an XML in-memory document and build a tree.
  6515. * This reuses the existing @ctxt parser context
  6516. *
  6517. * Returns the resulting document tree
  6518. */
  6519. htmlDocPtr
  6520. htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
  6521. const char *URL, const char *encoding, int options)
  6522. {
  6523. xmlParserInputPtr stream;
  6524. if (cur == NULL)
  6525. return (NULL);
  6526. if (ctxt == NULL)
  6527. return (NULL);
  6528. xmlInitParser();
  6529. htmlCtxtReset(ctxt);
  6530. stream = xmlNewStringInputStream(ctxt, cur);
  6531. if (stream == NULL) {
  6532. return (NULL);
  6533. }
  6534. inputPush(ctxt, stream);
  6535. return (htmlDoRead(ctxt, URL, encoding, options, 1));
  6536. }
  6537. /**
  6538. * htmlCtxtReadFile:
  6539. * @ctxt: an HTML parser context
  6540. * @filename: a file or URL
  6541. * @encoding: the document encoding, or NULL
  6542. * @options: a combination of htmlParserOption(s)
  6543. *
  6544. * parse an XML file from the filesystem or the network.
  6545. * This reuses the existing @ctxt parser context
  6546. *
  6547. * Returns the resulting document tree
  6548. */
  6549. htmlDocPtr
  6550. htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
  6551. const char *encoding, int options)
  6552. {
  6553. xmlParserInputPtr stream;
  6554. if (filename == NULL)
  6555. return (NULL);
  6556. if (ctxt == NULL)
  6557. return (NULL);
  6558. xmlInitParser();
  6559. htmlCtxtReset(ctxt);
  6560. stream = xmlLoadExternalEntity(filename, NULL, ctxt);
  6561. if (stream == NULL) {
  6562. return (NULL);
  6563. }
  6564. inputPush(ctxt, stream);
  6565. return (htmlDoRead(ctxt, NULL, encoding, options, 1));
  6566. }
  6567. /**
  6568. * htmlCtxtReadMemory:
  6569. * @ctxt: an HTML parser context
  6570. * @buffer: a pointer to a char array
  6571. * @size: the size of the array
  6572. * @URL: the base URL to use for the document
  6573. * @encoding: the document encoding, or NULL
  6574. * @options: a combination of htmlParserOption(s)
  6575. *
  6576. * parse an XML in-memory document and build a tree.
  6577. * This reuses the existing @ctxt parser context
  6578. *
  6579. * Returns the resulting document tree
  6580. */
  6581. htmlDocPtr
  6582. htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
  6583. const char *URL, const char *encoding, int options)
  6584. {
  6585. xmlParserInputBufferPtr input;
  6586. xmlParserInputPtr stream;
  6587. if (ctxt == NULL)
  6588. return (NULL);
  6589. if (buffer == NULL)
  6590. return (NULL);
  6591. xmlInitParser();
  6592. htmlCtxtReset(ctxt);
  6593. input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
  6594. if (input == NULL) {
  6595. return(NULL);
  6596. }
  6597. stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
  6598. if (stream == NULL) {
  6599. xmlFreeParserInputBuffer(input);
  6600. return(NULL);
  6601. }
  6602. inputPush(ctxt, stream);
  6603. return (htmlDoRead(ctxt, URL, encoding, options, 1));
  6604. }
  6605. /**
  6606. * htmlCtxtReadFd:
  6607. * @ctxt: an HTML parser context
  6608. * @fd: an open file descriptor
  6609. * @URL: the base URL to use for the document
  6610. * @encoding: the document encoding, or NULL
  6611. * @options: a combination of htmlParserOption(s)
  6612. *
  6613. * parse an XML from a file descriptor and build a tree.
  6614. * This reuses the existing @ctxt parser context
  6615. *
  6616. * Returns the resulting document tree
  6617. */
  6618. htmlDocPtr
  6619. htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
  6620. const char *URL, const char *encoding, int options)
  6621. {
  6622. xmlParserInputBufferPtr input;
  6623. xmlParserInputPtr stream;
  6624. if (fd < 0)
  6625. return (NULL);
  6626. if (ctxt == NULL)
  6627. return (NULL);
  6628. xmlInitParser();
  6629. htmlCtxtReset(ctxt);
  6630. input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
  6631. if (input == NULL)
  6632. return (NULL);
  6633. stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
  6634. if (stream == NULL) {
  6635. xmlFreeParserInputBuffer(input);
  6636. return (NULL);
  6637. }
  6638. inputPush(ctxt, stream);
  6639. return (htmlDoRead(ctxt, URL, encoding, options, 1));
  6640. }
  6641. /**
  6642. * htmlCtxtReadIO:
  6643. * @ctxt: an HTML parser context
  6644. * @ioread: an I/O read function
  6645. * @ioclose: an I/O close function
  6646. * @ioctx: an I/O handler
  6647. * @URL: the base URL to use for the document
  6648. * @encoding: the document encoding, or NULL
  6649. * @options: a combination of htmlParserOption(s)
  6650. *
  6651. * parse an HTML document from I/O functions and source and build a tree.
  6652. * This reuses the existing @ctxt parser context
  6653. *
  6654. * Returns the resulting document tree
  6655. */
  6656. htmlDocPtr
  6657. htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
  6658. xmlInputCloseCallback ioclose, void *ioctx,
  6659. const char *URL,
  6660. const char *encoding, int options)
  6661. {
  6662. xmlParserInputBufferPtr input;
  6663. xmlParserInputPtr stream;
  6664. if (ioread == NULL)
  6665. return (NULL);
  6666. if (ctxt == NULL)
  6667. return (NULL);
  6668. xmlInitParser();
  6669. htmlCtxtReset(ctxt);
  6670. input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
  6671. XML_CHAR_ENCODING_NONE);
  6672. if (input == NULL) {
  6673. if (ioclose != NULL)
  6674. ioclose(ioctx);
  6675. return (NULL);
  6676. }
  6677. stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
  6678. if (stream == NULL) {
  6679. xmlFreeParserInputBuffer(input);
  6680. return (NULL);
  6681. }
  6682. inputPush(ctxt, stream);
  6683. return (htmlDoRead(ctxt, URL, encoding, options, 1));
  6684. }
  6685. #define bottom_HTMLparser
  6686. #include "elfgcchack.h"
  6687. #endif /* LIBXML_HTML_ENABLED */