xml-textreader.h 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325
  1. #pragma once
  2. #include "xml-document.h"
  3. #include "xml-options.h"
  4. #include <contrib/libs/libxml/include/libxml/xmlreader.h>
  5. #include <library/cpp/string_utils/ztstrbuf/ztstrbuf.h>
  6. #include <util/generic/noncopyable.h>
  7. #include <util/generic/ptr.h>
  8. #include <util/generic/strbuf.h>
  9. #include <util/generic/string.h>
  10. #include <functional>
  11. #include <util/stream/input.h>
  12. #include <util/stream/str.h>
  13. namespace NXml {
  14. /**
  15. * TextReader Parser
  16. *
  17. * API of the XML streaming API based on C# interfaces.
  18. * Provides fast, non-cached, forward-only access to XML data.
  19. *
  20. * Like the SAX parser, the TextReader parser is suitable for sequential
  21. * parsing, but instead of implementing handlers for specific parts of the
  22. * document, it allows you to detect the current node type, process the node
  23. * accordingly, and skip forward in the document as much as necessary.
  24. *
  25. * Unlike the DOM parser, you may not move backwards in the XML document.
  26. * And unlike the SAX parser, you must not waste time processing nodes that do not
  27. * interest you.
  28. *
  29. * All methods are on the single parser instance, but their result depends on the current context.
  30. * For instance, use Read() to move to the next node, and MoveToElement() to navigate to child nodes.
  31. * These methods will return false when no more nodes are available. Then use
  32. * methods such as GetName() and GetValue() to examine the elements and their attributes.
  33. *
  34. * This wrapper is inspired by TextReader from libxml++.
  35. */
  36. class TTextReader: private TNonCopyable {
  37. public:
  38. // strongly-typed alias for enum from xmlreader.h
  39. enum class ENodeType : int {
  40. // clang-format off
  41. Attribute = XML_READER_TYPE_ATTRIBUTE,
  42. CDATA = XML_READER_TYPE_CDATA,
  43. Comment = XML_READER_TYPE_COMMENT,
  44. Document = XML_READER_TYPE_DOCUMENT,
  45. DocumentFragment = XML_READER_TYPE_DOCUMENT_FRAGMENT,
  46. DocumentType = XML_READER_TYPE_DOCUMENT_TYPE,
  47. Element = XML_READER_TYPE_ELEMENT,
  48. EndElement = XML_READER_TYPE_END_ELEMENT,
  49. EndEntity = XML_READER_TYPE_END_ENTITY,
  50. Entity = XML_READER_TYPE_ENTITY,
  51. EntityReference = XML_READER_TYPE_ENTITY_REFERENCE,
  52. None = XML_READER_TYPE_NONE,
  53. Notation = XML_READER_TYPE_NOTATION,
  54. ProcessingInstruction = XML_READER_TYPE_PROCESSING_INSTRUCTION,
  55. SignificantWhitespace = XML_READER_TYPE_SIGNIFICANT_WHITESPACE,
  56. Text = XML_READER_TYPE_TEXT,
  57. Whitespace = XML_READER_TYPE_WHITESPACE,
  58. XmlDeclaration = XML_READER_TYPE_XML_DECLARATION,
  59. // clang-format on
  60. };
  61. enum class EReadState : int {
  62. // clang-format off
  63. Closed = XML_TEXTREADER_MODE_CLOSED,
  64. EndOfFile = XML_TEXTREADER_MODE_EOF,
  65. Error = XML_TEXTREADER_MODE_ERROR,
  66. Initial = XML_TEXTREADER_MODE_INITIAL,
  67. Interactive = XML_TEXTREADER_MODE_INTERACTIVE,
  68. Reading = XML_TEXTREADER_MODE_READING,
  69. // clang-format on
  70. };
  71. public:
  72. TTextReader(IInputStream& stream, const TOptions& options = TOptions());
  73. ~TTextReader();
  74. /**
  75. * Moves the position of the current instance to the next node in the stream, exposing its properties.
  76. * @return true if the node was read successfully, false if there are no more nodes to read
  77. */
  78. bool Read();
  79. /**
  80. * Reads the contents of the current node, including child nodes and markup.
  81. * @return A string containing the XML content, or an empty string
  82. * if the current node is neither an element nor attribute, or has no child nodes
  83. */
  84. TString ReadInnerXml() const;
  85. /**
  86. * Reads the current node and its contents, including child nodes and markup.
  87. * @return A string containing the XML content, or an empty string
  88. * if the current node is neither an element nor attribute
  89. */
  90. TString ReadOuterXml() const;
  91. /**
  92. * Reads the contents of an element or a text node as a string.
  93. * @return A string containing the contents of the Element or Text node,
  94. * or an empty string if the reader is positioned on any other type of node
  95. */
  96. TString ReadString() const;
  97. /**
  98. * Parses an attribute value into one or more Text and EntityReference nodes.
  99. * @return A bool where true indicates the attribute value was parsed,
  100. * and false indicates the reader was not positioned on an attribute node
  101. * or all the attribute values have been read
  102. */
  103. bool ReadAttributeValue() const;
  104. /**
  105. * Gets the number of attributes on the current node.
  106. * @return The number of attributes on the current node, or zero if the current node
  107. * does not support attributes
  108. */
  109. int GetAttributeCount() const;
  110. /**
  111. * Gets the base Uniform Resource Identifier (URI) of the current node.
  112. * @return The base URI of the current node or an empty string if not available
  113. */
  114. TStringBuf GetBaseUri() const;
  115. /**
  116. * Gets the depth of the current node in the XML document.
  117. * @return The depth of the current node in the XML document
  118. */
  119. int GetDepth() const;
  120. /**
  121. * Gets a value indicating whether the current node has any attributes.
  122. * @return true if the current has attributes, false otherwise
  123. */
  124. bool HasAttributes() const;
  125. /**
  126. * Whether the node can have a text value.
  127. * @return true if the current node can have an associated text value, false otherwise
  128. */
  129. bool HasValue() const;
  130. /**
  131. * Whether an Attribute node was generated from the default value defined in the DTD or schema.
  132. * @return true if defaulted, false otherwise
  133. */
  134. bool IsDefault() const;
  135. /**
  136. * Check if the current node is empty.
  137. * @return true if empty, false otherwise
  138. */
  139. bool IsEmptyElement() const;
  140. /**
  141. * The local name of the node.
  142. * @return the local name or empty string if not available
  143. */
  144. TStringBuf GetLocalName() const;
  145. /**
  146. * The qualified name of the node, equal to Prefix:LocalName.
  147. * @return the name or empty string if not available
  148. */
  149. TStringBuf GetName() const;
  150. /**
  151. * The URI defining the namespace associated with the node.
  152. * @return the namespace URI or empty string if not available
  153. */
  154. TStringBuf GetNamespaceUri() const;
  155. /**
  156. * Get the node type of the current node.
  157. * @return the ENodeType of the current node
  158. */
  159. ENodeType GetNodeType() const;
  160. /**
  161. * Get the namespace prefix associated with the current node.
  162. * @return the namespace prefix, or an empty string if not available
  163. */
  164. TStringBuf GetPrefix() const;
  165. /**
  166. * Get the quotation mark character used to enclose the value of an attribute.
  167. * @return " or '
  168. */
  169. char GetQuoteChar() const;
  170. /**
  171. * Provides the text value of the node if present.
  172. * @return the string or empty if not available
  173. */
  174. TStringBuf GetValue() const;
  175. /**
  176. * Gets the read state of the reader.
  177. * @return the state value
  178. */
  179. EReadState GetReadState() const;
  180. /**
  181. * This method releases any resources allocated by the current instance
  182. * changes the state to Closed and close any underlying input.
  183. */
  184. void Close();
  185. /**
  186. * Provides the value of the attribute with the specified index relative to the containing element.
  187. * @param number the zero-based index of the attribute relative to the containing element
  188. */
  189. TString GetAttribute(int number) const;
  190. /**
  191. * Provides the value of the attribute with the specified qualified name.
  192. * @param name the qualified name of the attribute
  193. */
  194. TString GetAttribute(TZtStringBuf name) const;
  195. /**
  196. * Provides the value of the specified attribute.
  197. * @param localName the local name of the attribute
  198. * @param nsUri the namespace URI of the attribute
  199. */
  200. TString GetAttribute(TZtStringBuf localName, TZtStringBuf nsUri) const;
  201. /**
  202. * Resolves a namespace prefix in the scope of the current element.
  203. * @param prefix the prefix whose namespace URI is to be resolved. To return the default namespace, specify empty string.
  204. * @return a string containing the namespace URI to which the prefix maps.
  205. */
  206. TString LookupNamespace(TZtStringBuf prefix) const;
  207. /**
  208. * Moves the position of the current instance to the attribute with the specified index relative to the containing element.
  209. * @param number the zero-based index of the attribute relative to the containing element
  210. * @return true in case of success, false if not found
  211. */
  212. bool MoveToAttribute(int number);
  213. /**
  214. * Moves the position of the current instance to the attribute with the specified qualified name.
  215. * @param name the qualified name of the attribute
  216. * @return true in case of success, false if not found
  217. */
  218. bool MoveToAttribute(TZtStringBuf name);
  219. /**
  220. * Moves the position of the current instance to the attribute with the specified local name and namespace URI.
  221. * @param localName the local name of the attribute
  222. * @param nsUri the namespace URI of the attribute
  223. * @return true in case of success, false if not found
  224. */
  225. bool MoveToAttribute(TZtStringBuf localName, TZtStringBuf nsUri);
  226. /**
  227. * Moves the position of the current instance to the first attribute associated with the current node.
  228. * @return true in case of success, false if not found
  229. */
  230. bool MoveToFirstAttribute();
  231. /**
  232. * Moves the position of the current instance to the next attribute associated with the current node.
  233. * @return true in case of success, false if not found
  234. */
  235. bool MoveToNextAttribute();
  236. /**
  237. * Moves the position of the current instance to the node that contains the current Attribute node.
  238. * @return true in case of success, false if not found
  239. */
  240. bool MoveToElement();
  241. /**
  242. * Reads the contents of the current node and the full subtree. It then makes the subtree available until the next Read() call.
  243. */
  244. TConstNode Expand() const;
  245. /**
  246. * Skip to the node following the current one in document order while avoiding the subtree if any.
  247. * @return true if the node was read successfully, false if there is no more nodes to read
  248. */
  249. bool Next();
  250. /**
  251. * Retrieve the validity status from the parser context.
  252. */
  253. bool IsValid() const;
  254. private:
  255. static int ReadFromInputStreamCallback(void* context, char* buffer, int len);
  256. static void OnLibxmlError(void* arg, const char* msg, xmlParserSeverities severity, xmlTextReaderLocatorPtr locator);
  257. void SetupErrorHandler();
  258. TStringStream& LogError() const;
  259. void CheckForExceptions() const;
  260. void ThrowException() const;
  261. // helpers that check return codes of C functions from libxml
  262. bool BoolResult(int value) const;
  263. int IntResult(int value) const;
  264. char CharResult(int value) const;
  265. TStringBuf ConstStringResult(const xmlChar* value) const;
  266. TStringBuf ConstStringOrEmptyResult(const xmlChar* value) const;
  267. TString TempStringResult(TCharPtr value) const;
  268. TString TempStringOrEmptyResult(TCharPtr value) const;
  269. private:
  270. IInputStream& Stream;
  271. mutable bool IsError;
  272. mutable TStringStream ErrorBuffer;
  273. struct TDeleter;
  274. THolder<xmlTextReader, TDeleter> Impl;
  275. };
  276. }