# scanner.py
  1. # SPDX-License-Identifier: MIT
  2. # Scanner produces tokens of the following types:
  3. # STREAM-START
  4. # STREAM-END
  5. # DIRECTIVE(name, value)
  6. # DOCUMENT-START
  7. # DOCUMENT-END
  8. # BLOCK-SEQUENCE-START
  9. # BLOCK-MAPPING-START
  10. # BLOCK-END
  11. # FLOW-SEQUENCE-START
  12. # FLOW-MAPPING-START
  13. # FLOW-SEQUENCE-END
  14. # FLOW-MAPPING-END
  15. # BLOCK-ENTRY
  16. # FLOW-ENTRY
  17. # KEY
  18. # VALUE
  19. # ALIAS(value)
  20. # ANCHOR(value)
  21. # TAG(value)
  22. # SCALAR(value, plain, style)
  23. #
  24. # Read comments in the Scanner code for more details.
  25. #
  26. __all__ = ['Scanner', 'ScannerError']
  27. from .error import MarkedYAMLError
  28. from .tokens import *
  class ScannerError(MarkedYAMLError):
      """Error raised by the scanner when the input cannot be tokenized."""
  class SimpleKey:
      """Record of a position at which a simple key may begin.

      Holds the number of the candidate's first token in the token stream,
      whether a key is required at that position, and the position itself
      (index, line, column and mark).  See the simple keys treatment in
      Scanner.
      """

      def __init__(self, token_number, required, index, line, column, mark):
          self.token_number, self.required = token_number, required
          self.index, self.line, self.column = index, line, column
          self.mark = mark
  40. class Scanner:
  def __init__(self):
      """Initialize the scanner."""
      # Scanner and Reader are assumed to share a common descendant.  Reader
      # does the dirty work of detecting the BOM, converting the input to
      # Unicode and appending NUL to the end.  The Reader methods used here:
      #   self.peek(i=0)     -- peek the next i-th character
      #   self.prefix(l=1)   -- peek the next l characters
      #   self.forward(l=1)  -- read the next l characters and move the pointer

      # Set once the end of the stream has been reached.
      self.done = False

      # Count of unclosed '{' and '['; zero means block context.
      self.flow_level = 0

      # Tokens that have been produced but not yet handed out.  The list
      # must exist before STREAM-START is queued into it.
      self.tokens = []
      self.fetch_stream_start()

      # How many tokens `get_token` has handed out so far.
      self.tokens_taken = 0

      # Current indentation level and the stack of enclosing levels.
      self.indent, self.indents = -1, []

      # Simple keys treatment.
      #
      # A simple key is a key that is not introduced by the '?' indicator:
      #   ---
      #   block simple key: value
      #   ? not a simple key:
      #   : { flow simple key: value }
      # The KEY token is emitted before every key, so whenever a potential
      # simple key is found we try to locate the matching ':' indicator.
      # Simple keys should be limited to a single line and 1024 characters.
      #
      # `allow_simple_key` tells whether a simple key may start at the
      # current position.  A simple key may start
      #   - at the beginning of the line, not counting indentation spaces
      #     (block context),
      #   - after '{', '[', ',' (flow context),
      #   - after '?', ':', '-' (block context).
      # In block context the flag also tells whether a block collection may
      # start at the current position.
      self.allow_simple_key = True

      # Candidate simple keys, keyed by `flow_level`; at most one candidate
      # per level.  Each value is a SimpleKey record
      #   (token_number, required, index, line, column, mark).
      # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow), '['
      # or '{' tokens.
      self.possible_simple_keys = {}
  92. # Public methods.
  def check_token(self, *choices):
      """Tell whether the next token is an instance of one of `choices`.

      With no `choices`, tell whether there is a next token at all.
      """
      while self.need_more_tokens():
          self.fetch_more_tokens()
      if not self.tokens:
          return False
      if not choices:
          return True
      head = self.tokens[0]
      return any(isinstance(head, choice) for choice in choices)
  def peek_token(self):
      """Return the next token without removing it from the queue.

      Returns None when the queue is empty.
      """
      while self.need_more_tokens():
          self.fetch_more_tokens()
      return self.tokens[0] if self.tokens else None
  def get_token(self):
      """Remove and return the next token, counting it as taken.

      Returns None when the queue is empty.
      """
      while self.need_more_tokens():
          self.fetch_more_tokens()
      if not self.tokens:
          return None
      self.tokens_taken += 1
      return self.tokens.pop(0)
  117. # Private methods.
  def need_more_tokens(self):
      """Tell whether more input must be scanned before a token is handed out.

      Returns:
          bool: False once the stream is done; True when the queue is empty
          or when the token at the head of the queue is still a possible
          simple key; False otherwise.
      """
      if self.done:
          return False
      if not self.tokens:
          return True
      # The current token may be a potential simple key, so we need to look
      # further before it can be released.
      self.stale_possible_simple_keys()
      return self.next_possible_simple_key() == self.tokens_taken
  def fetch_more_tokens(self):
      """Scan to the next token and dispatch to the fetcher that queues it.

      Raises ScannerError when the next character cannot start any token.
      """
      # Skip whitespace and comments, drop obsolete simple key candidates,
      # then compare the indentation with the current column (this may add
      # tokens and decrease the indentation level).
      self.scan_to_next_token()
      self.stale_possible_simple_keys()
      self.unwind_indent(self.column)

      ch = self.peek()

      # End of stream.
      if ch == '\0':
          return self.fetch_stream_end()

      # Indicators that select their fetcher without any further check.
      # Each character matches at most one rule, so looking these up ahead
      # of the conditional rules below selects the same fetcher.
      unconditional = {
          '[': 'fetch_flow_sequence_start',
          '{': 'fetch_flow_mapping_start',
          ']': 'fetch_flow_sequence_end',
          '}': 'fetch_flow_mapping_end',
          ',': 'fetch_flow_entry',
          '*': 'fetch_alias',
          '&': 'fetch_anchor',
          '!': 'fetch_tag',
          '\'': 'fetch_single',
          '\"': 'fetch_double',
      }
      if ch in unconditional:
          return getattr(self, unconditional[ch])()

      # Directive, document start and document end.
      if ch == '%' and self.check_directive():
          return self.fetch_directive()
      if ch == '-' and self.check_document_start():
          return self.fetch_document_start()
      if ch == '.' and self.check_document_end():
          return self.fetch_document_end()

      # TODO: support for BOM within a stream.
      #if ch == '\uFEFF':
      #    return self.fetch_bom()    <-- issue BOMToken

      # Block entry, key and value indicators.
      if ch == '-' and self.check_block_entry():
          return self.fetch_block_entry()
      if ch == '?' and self.check_key():
          return self.fetch_key()
      if ch == ':' and self.check_value():
          return self.fetch_value()

      # Literal and folded scalars are recognised in block context only.
      if not self.flow_level:
          if ch == '|':
              return self.fetch_literal()
          if ch == '>':
              return self.fetch_folded()

      # Anything else must be a plain scalar.
      if self.check_plain():
          return self.fetch_plain()

      # Nothing matched: report the offending character.
      raise ScannerError("while scanning for the next token", None,
              "found character %r that cannot start any token" % ch,
              self.get_mark())
  206. # Simple keys treatment.
  def next_possible_simple_key(self):
      """Return the token number of the nearest possible simple key.

      Returns None when no candidate is saved.
      """
      # Scanning every candidate is not strictly needed; the candidate
      # stored under the lowest flow level could be used instead:
      #   if not self.possible_simple_keys:
      #       return None
      #   return self.possible_simple_keys[
      #           min(self.possible_simple_keys.keys())].token_number
      candidates = self.possible_simple_keys.values()
      return min((key.token_number for key in candidates), default=None)
  def stale_possible_simple_keys(self):
      """Remove saved candidates that can no longer be simple keys.

      According to the YAML specification, simple keys
        - should be limited to a single line,
        - should be no longer than 1024 characters.
      A candidate is dropped when it was saved on a different line than the
      current one or more than 1024 index positions back.  Disabling this
      procedure would allow simple keys of any length and height (which may
      cause problems if indentation is broken).

      Raises:
          ScannerError: if a candidate that has to be dropped was required.
      """
      # Iterate over a snapshot of the levels: entries are deleted in the loop.
      for level in list(self.possible_simple_keys):
          key = self.possible_simple_keys[level]
          if key.line != self.line or self.index - key.index > 1024:
              if key.required:
                  raise ScannerError("while scanning a simple key", key.mark,
                          "could not find expected ':'", self.get_mark())
              del self.possible_simple_keys[level]
  def save_possible_simple_key(self):
      """Remember the current position as a possible start of a simple key.

      Called for ALIAS, ANCHOR, TAG, SCALAR(flow), '[' and '{'.  Nothing is
      saved when simple keys are not allowed at the current position.
      """
      # A simple key is required here when we are in block context exactly
      # at the current indentation column.
      required = not self.flow_level and self.indent == self.column

      # A simple key is required only if it is the first token in the
      # current line, so whenever it is required it must also be allowed.
      assert self.allow_simple_key or not required

      if not self.allow_simple_key:
          return

      # Replace any candidate already saved on this flow level with one
      # pointing at the token that is about to be queued.
      self.remove_possible_simple_key()
      token_number = self.tokens_taken + len(self.tokens)
      self.possible_simple_keys[self.flow_level] = SimpleKey(
              token_number, required,
              self.index, self.line, self.column, self.get_mark())
  def remove_possible_simple_key(self):
      """Forget the candidate simple key saved on the current flow level.

      Does nothing when no candidate is saved on this level.

      Raises:
          ScannerError: if the saved candidate was required.
      """
      if self.flow_level not in self.possible_simple_keys:
          return
      key = self.possible_simple_keys[self.flow_level]
      if key.required:
          raise ScannerError("while scanning a simple key", key.mark,
                  "could not find expected ':'", self.get_mark())
      del self.possible_simple_keys[self.flow_level]
  261. # Indentation functions.
  def unwind_indent(self, column):
      """Pop indentation levels deeper than `column`, queueing BLOCK-END each.

      Does nothing in flow context.
      """
      # In flow context indentation is ignored, which makes the scanner
      # less restrictive than the specification requires.  A check that
      # raised "invalid intendation or unclosed '[' or '{'" when
      # `self.indent > column` in flow context is deliberately left out:
      # according to the spec the condition should even be
      # `self.indent >= column`, but that would prohibit intuitively
      # correct constructions such as
      #   key : {
      #   }
      if self.flow_level:
          return

      # In block context we may need to issue BLOCK-END tokens.
      while column < self.indent:
          here = self.get_mark()
          self.indent = self.indents.pop()
          self.tokens.append(BlockEndToken(here, here))
  def add_indent(self, column):
      """Raise the indentation level to `column` if it is deeper.

      Returns True when the level was increased (the previous level is
      pushed on `self.indents`), False otherwise.
      """
      if column <= self.indent:
          return False
      self.indents.append(self.indent)
      self.indent = column
      return True
  289. # Fetchers.
  def fetch_stream_start(self):
      """Queue the STREAM-START token.

      STREAM-START is always the first token and STREAM-END the last one.
      """
      # The token spans no characters: one mark is both start and end.
      here = self.get_mark()
      token = StreamStartToken(here, here, encoding=self.encoding)
      self.tokens.append(token)
  def fetch_stream_end(self):
      """Queue the STREAM-END token and mark the scanner as done."""
      # Set the current indentation to -1.
      self.unwind_indent(-1)

      # Reset simple keys: drop the candidate on the current level, forbid
      # new ones and forget every remaining candidate.
      self.remove_possible_simple_key()
      self.allow_simple_key = False
      self.possible_simple_keys = {}

      # STREAM-END spans no characters: one mark is both start and end.
      here = self.get_mark()
      self.tokens.append(StreamEndToken(here, here))

      # The stream is finished.
      self.done = True
  def fetch_directive(self):
      """Scan a directive and queue the resulting DIRECTIVE token."""
      # Set the current indentation to -1.
      self.unwind_indent(-1)

      # Reset simple keys.
      self.remove_possible_simple_key()
      self.allow_simple_key = False

      directive = self.scan_directive()
      self.tokens.append(directive)
  def fetch_document_start(self):
      """Queue a DOCUMENT-START token."""
      token_class = DocumentStartToken
      self.fetch_document_indicator(token_class)
  def fetch_document_end(self):
      """Queue a DOCUMENT-END token."""
      token_class = DocumentEndToken
      self.fetch_document_indicator(token_class)
  def fetch_document_indicator(self, TokenClass):
      """Consume a three-character document indicator and queue its token.

      `TokenClass` is called with the start and end marks to build the
      DOCUMENT-START or DOCUMENT-END token.
      """
      # Set the current indentation to -1.
      self.unwind_indent(-1)

      # Reset simple keys.  Note that there could not be a block collection
      # after '---'.
      self.remove_possible_simple_key()
      self.allow_simple_key = False

      # Step over the indicator, taking a mark on each side of it.
      begin = self.get_mark()
      self.forward(3)
      finish = self.get_mark()
      self.tokens.append(TokenClass(begin, finish))
  def fetch_flow_sequence_start(self):
      """Queue a FLOW-SEQUENCE-START token for '['."""
      token_class = FlowSequenceStartToken
      self.fetch_flow_collection_start(token_class)
  def fetch_flow_mapping_start(self):
      """Queue a FLOW-MAPPING-START token for '{'."""
      token_class = FlowMappingStartToken
      self.fetch_flow_collection_start(token_class)
  def fetch_flow_collection_start(self, TokenClass):
      """Open a flow collection and queue its start token.

      `TokenClass` is called with the start and end marks to build the
      FLOW-SEQUENCE-START or FLOW-MAPPING-START token.
      """
      # '[' and '{' may start a simple key; the candidate is saved on the
      # level we are leaving, i.e. before the flow level is increased.
      self.save_possible_simple_key()
      self.flow_level += 1

      # Simple keys are allowed after '[' and '{'.
      self.allow_simple_key = True

      # Step over the one-character indicator.
      begin = self.get_mark()
      self.forward()
      finish = self.get_mark()
      self.tokens.append(TokenClass(begin, finish))
  def fetch_flow_sequence_end(self):
      """Queue a FLOW-SEQUENCE-END token for ']'."""
      token_class = FlowSequenceEndToken
      self.fetch_flow_collection_end(token_class)
  def fetch_flow_mapping_end(self):
      """Queue a FLOW-MAPPING-END token for '}'."""
      token_class = FlowMappingEndToken
      self.fetch_flow_collection_end(token_class)
  def fetch_flow_collection_end(self, TokenClass):
      """Close a flow collection and queue its end token.

      `TokenClass` is called with the start and end marks to build the
      FLOW-SEQUENCE-END or FLOW-MAPPING-END token.
      """
      # Reset the possible simple key on the level being closed, then
      # leave that level.
      self.remove_possible_simple_key()
      self.flow_level -= 1

      # No simple keys after ']' or '}'.
      self.allow_simple_key = False

      # Step over the one-character indicator.
      begin = self.get_mark()
      self.forward()
      finish = self.get_mark()
      self.tokens.append(TokenClass(begin, finish))
  def fetch_flow_entry(self):
      """Queue a FLOW-ENTRY token for ','."""
      # Simple keys are allowed after ','.  The flag is set before the
      # candidate on the current level is reset.
      self.allow_simple_key = True
      self.remove_possible_simple_key()

      # Step over the one-character indicator.
      begin = self.get_mark()
      self.forward()
      finish = self.get_mark()
      self.tokens.append(FlowEntryToken(begin, finish))
  def fetch_block_entry(self):
      """Queue a BLOCK-ENTRY token for '-'.

      In block context a BLOCK-SEQUENCE-START token is queued first when
      the entry opens a deeper indentation level.  Raises ScannerError in
      block context when a new entry is not allowed at this position.
      """
      # Only block context needs additional checks.  A block entry inside
      # flow context is an error too, but the parser is left to detect it.
      if not self.flow_level:
          # Are we allowed to start a new entry?
          if not self.allow_simple_key:
              raise ScannerError(None, None,
                      "sequence entries are not allowed here",
                      self.get_mark())
          # We may need to add BLOCK-SEQUENCE-START.
          if self.add_indent(self.column):
              here = self.get_mark()
              self.tokens.append(BlockSequenceStartToken(here, here))

      # Simple keys are allowed after '-'.  The flag is set before the
      # candidate on the current level is reset.
      self.allow_simple_key = True
      self.remove_possible_simple_key()

      # Step over the one-character indicator.
      begin = self.get_mark()
      self.forward()
      finish = self.get_mark()
      self.tokens.append(BlockEntryToken(begin, finish))
  def fetch_key(self):
      """Queue a KEY token for the '?' indicator.

      In block context a BLOCK-MAPPING-START token is queued first when the
      key opens a deeper indentation level.  Raises ScannerError in block
      context when a key is not allowed at this position.
      """
      in_block = not self.flow_level

      # Only block context needs additional checks.
      if in_block:
          # Are we allowed to start a key (not necessarily a simple one)?
          if not self.allow_simple_key:
              raise ScannerError(None, None,
                      "mapping keys are not allowed here",
                      self.get_mark())
          # We may need to add BLOCK-MAPPING-START.
          if self.add_indent(self.column):
              here = self.get_mark()
              self.tokens.append(BlockMappingStartToken(here, here))

      # Simple keys are allowed after '?' in block context only.
      self.allow_simple_key = in_block

      # Reset the possible simple key on the current level.
      self.remove_possible_simple_key()

      # Step over the one-character indicator.
      begin = self.get_mark()
      self.forward()
      finish = self.get_mark()
      self.tokens.append(KeyToken(begin, finish))
  def fetch_value(self):
      """Queue a VALUE token for ':'.

      When a simple key candidate is saved on the current flow level, the
      KEY token (and, in block context, BLOCK-MAPPING-START if the key opens
      a deeper indentation level) is inserted in front of that candidate's
      first token.  Otherwise the ':' is treated as part of a complex key.
      """
      in_block = not self.flow_level

      if self.flow_level in self.possible_simple_keys:
          # The candidate turned out to be a simple key: take it out of the
          # table and insert KEY at the position of its first token.
          key = self.possible_simple_keys.pop(self.flow_level)
          position = key.token_number - self.tokens_taken
          self.tokens.insert(position, KeyToken(key.mark, key.mark))

          # If this key starts a new block mapping, BLOCK-MAPPING-START
          # goes in front of the KEY token just inserted.
          if in_block and self.add_indent(key.column):
              self.tokens.insert(position,
                      BlockMappingStartToken(key.mark, key.mark))

          # There cannot be two simple keys one after another.
          self.allow_simple_key = False
      else:
          # It must be a part of a complex key.
          if in_block:
              # Block context needs additional checks (the parser would
              # catch these anyway).  A complex value may start if and
              # only if a simple key may start.
              if not self.allow_simple_key:
                  raise ScannerError(None, None,
                          "mapping values are not allowed here",
                          self.get_mark())

              # If this value starts a new block mapping, add
              # BLOCK-MAPPING-START.  It will be detected as an error
              # later by the parser.
              if self.add_indent(self.column):
                  here = self.get_mark()
                  self.tokens.append(BlockMappingStartToken(here, here))

          # Simple keys are allowed after ':' in block context only.
          self.allow_simple_key = in_block

          # Reset the possible simple key on the current level.
          self.remove_possible_simple_key()

      # Step over the one-character indicator.
      begin = self.get_mark()
      self.forward()
      finish = self.get_mark()
      self.tokens.append(ValueToken(begin, finish))
  def fetch_alias(self):
      """Scan an alias and queue the resulting ALIAS token."""
      # ALIAS could be a simple key, but no simple key may follow it.
      self.save_possible_simple_key()
      self.allow_simple_key = False

      alias = self.scan_anchor(AliasToken)
      self.tokens.append(alias)
  def fetch_anchor(self):
      """Scan an anchor and queue the resulting ANCHOR token."""
      # ANCHOR could start a simple key, but no simple key may follow it.
      self.save_possible_simple_key()
      self.allow_simple_key = False

      anchor = self.scan_anchor(AnchorToken)
      self.tokens.append(anchor)
  def fetch_tag(self):
      """Scan a tag and queue the resulting TAG token."""
      # TAG could start a simple key, but no simple key may follow it.
      self.save_possible_simple_key()
      self.allow_simple_key = False

      tag = self.scan_tag()
      self.tokens.append(tag)
  def fetch_literal(self):
      """Queue a block scalar written in the literal ('|') style."""
      self.fetch_block_scalar('|')
  def fetch_folded(self):
      """Queue a block scalar written in the folded ('>') style."""
      self.fetch_block_scalar('>')
  def fetch_block_scalar(self, style):
      """Scan a block scalar of the given `style` and queue its SCALAR token."""
      # A simple key may follow a block scalar.  The flag is set before the
      # candidate on the current level is reset.
      self.allow_simple_key = True
      self.remove_possible_simple_key()

      scalar = self.scan_block_scalar(style)
      self.tokens.append(scalar)
  def fetch_single(self):
      """Queue a single-quoted flow scalar."""
      self.fetch_flow_scalar('\'')
  def fetch_double(self):
      """Queue a double-quoted flow scalar."""
      self.fetch_flow_scalar('"')
  def fetch_flow_scalar(self, style):
      """Scan a quoted scalar of the given `style` and queue its SCALAR token."""
      # A flow scalar could be a simple key, but no simple key may follow it.
      self.save_possible_simple_key()
      self.allow_simple_key = False

      scalar = self.scan_flow_scalar(style)
      self.tokens.append(scalar)
  def fetch_plain(self):
      """Scan a plain scalar and queue the resulting SCALAR token."""
      # A plain scalar could be a simple key.
      self.save_possible_simple_key()

      # No simple keys after plain scalars.  Note that `scan_plain` will
      # change this flag if the scan is finished at the beginning of the
      # line.
      self.allow_simple_key = False

      scalar = self.scan_plain()
      self.tokens.append(scalar)
  519. # Checkers.
  def check_directive(self):
      """Tell whether the '%' at the current position starts a directive.

      DIRECTIVE: ^ '%' ...
      The '%' indicator itself has already been checked by the caller, so
      only the column matters here.

      Returns:
          bool: True when the current column is 0, False otherwise.
      """
      return self.column == 0
  def check_document_start(self):
      """Tell whether a document start marker begins at the current position.

      DOCUMENT-START: ^ '---' (' '|'\n')

      Returns:
          bool: True when the current column is 0, the next three characters
          are '---' and the character after them is NUL, a space, a tab or
          a line break; False otherwise.
      """
      return (self.column == 0
              and self.prefix(3) == '---'
              and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029')
  def check_document_end(self):
      """Tell whether a document end marker begins at the current position.

      DOCUMENT-END: ^ '...' (' '|'\n')

      Returns:
          bool: True when the current column is 0, the next three characters
          are '...' and the character after them is NUL, a space, a tab or
          a line break; False otherwise.
      """
      return (self.column == 0
              and self.prefix(3) == '...'
              and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029')
  def check_block_entry(self):
      """Tell whether the '-' at the current position is a block entry.

      BLOCK-ENTRY: '-' (' '|'\n')
      """
      following = self.peek(1)
      return following in '\0 \t\r\n\x85\u2028\u2029'
  def check_key(self):
      """Tell whether the '?' at the current position is a key indicator.

      KEY(flow context):  '?'
      KEY(block context): '?' (' '|'\n')
      """
      if self.flow_level:
          return True
      following = self.peek(1)
      return following in '\0 \t\r\n\x85\u2028\u2029'
  def check_value(self):
      """Tell whether the ':' at the current position is a value indicator.

      VALUE(flow context):  ':'
      VALUE(block context): ':' (' '|'\n')
      """
      if self.flow_level:
          return True
      following = self.peek(1)
      return following in '\0 \t\r\n\x85\u2028\u2029'
  554. def check_plain(self):
  555. # A plain scalar may start with any non-space character except:
  556. # '-', '?', ':', ',', '[', ']', '{', '}',
  557. # '#', '&', '*', '!', '|', '>', '\'', '\"',
  558. # '%', '@', '`'.
  559. #
  560. # It may also start with
  561. # '-', '?', ':'
  562. # if it is followed by a non-space character.
  563. #
  564. # Note that we limit the last rule to the block context (except the
  565. # '-' character) because we want the flow context to be space
  566. # independent.
  567. ch = self.peek()
  568. return ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`' \
  569. or (self.peek(1) not in '\0 \t\r\n\x85\u2028\u2029'
  570. and (ch == '-' or (not self.flow_level and ch in '?:')))
  571. # Scanners.
  def scan_to_next_token(self):
      """Skip spaces, comments and line breaks up to the next token.

      Whenever a line break is consumed in block context the flag
      `allow_simple_key` is switched on.
      """
      # The byte order mark is stripped if it is the first character in the
      # stream.  BOM inside the stream is not yet supported as the
      # specification requires; any such mark is considered a part of the
      # document.
      #
      # TODO: We need to make tab handling rules more sane.  A good rule is
      #   Tabs cannot precede tokens
      #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
      #   KEY(block), VALUE(block), BLOCK-ENTRY
      # So the checking code is
      #   if <TAB>:
      #       self.allow_simple_keys = False
      # We also need to add the check for `allow_simple_keys == True` to
      # `unwind_indent` before issuing BLOCK-END.
      # Scanners for block, flow, and plain scalars need to be modified.

      if self.index == 0 and self.peek() == '\uFEFF':
          self.forward()

      while True:
          # Leading spaces.
          while self.peek() == ' ':
              self.forward()
          # A comment runs up to (not including) the line break or NUL.
          if self.peek() == '#':
              while self.peek() not in '\0\r\n\x85\u2028\u2029':
                  self.forward()
          # Without a line break to consume we are standing on the token.
          if not self.scan_line_break():
              break
          if not self.flow_level:
              self.allow_simple_key = True
  def scan_directive(self):
      """Scan a directive line and return its DirectiveToken.

      See the specification for details.  The value is scanned for the
      'YAML' and 'TAG' directives; for any other name the value is None and
      the rest of the line is skipped.
      """
      start_mark = self.get_mark()
      # Step over the directive indicator.
      self.forward()
      name = self.scan_directive_name(start_mark)

      value = None
      if name == 'YAML':
          value = self.scan_yaml_directive_value(start_mark)
      elif name == 'TAG':
          value = self.scan_tag_directive_value(start_mark)

      # The token ends right after the value or, for an unknown directive,
      # right after the name: the end mark is taken before the remainder
      # of an unknown directive's line is skipped.
      end_mark = self.get_mark()
      if name not in ('YAML', 'TAG'):
          while self.peek() not in '\0\r\n\x85\u2028\u2029':
              self.forward()

      self.scan_directive_ignored_line(start_mark)
      return DirectiveToken(name, value, start_mark, end_mark)
  def scan_directive_name(self, start_mark):
      """Scan and return the name of a directive.

      See the specification for details.  The name is a non-empty run of
      ASCII letters, digits, '-' and '_' that must be followed by NUL, a
      space or a line break; otherwise ScannerError is raised.
      """
      # Measure the name by peeking ahead without consuming anything.
      length = 0
      while True:
          ch = self.peek(length)
          is_name_char = ('0' <= ch <= '9' or 'A' <= ch <= 'Z'
                          or 'a' <= ch <= 'z' or ch in '-_')
          if not is_name_char:
              break
          length += 1
      if not length:
          raise ScannerError("while scanning a directive", start_mark,
                  "expected alphabetic or numeric character, but found %r"
                  % ch, self.get_mark())

      name = self.prefix(length)
      self.forward(length)

      following = self.peek()
      if following not in '\0 \r\n\x85\u2028\u2029':
          raise ScannerError("while scanning a directive", start_mark,
                  "expected alphabetic or numeric character, but found %r"
                  % following, self.get_mark())
      return name
  def scan_yaml_directive_value(self, start_mark):
      """Scan the version of a YAML directive and return (major, minor).

      See the specification for details.  Raises ScannerError when the two
      numbers are not separated by '.' or when the minor number is not
      followed by NUL, a space or a line break.
      """
      # Skip the spaces in front of the version.
      while self.peek() == ' ':
          self.forward()

      major = self.scan_yaml_directive_number(start_mark)

      separator = self.peek()
      if separator != '.':
          raise ScannerError("while scanning a directive", start_mark,
                  "expected a digit or '.', but found %r" % separator,
                  self.get_mark())
      self.forward()

      minor = self.scan_yaml_directive_number(start_mark)

      following = self.peek()
      if following not in '\0 \r\n\x85\u2028\u2029':
          raise ScannerError("while scanning a directive", start_mark,
                  "expected a digit or ' ', but found %r" % following,
                  self.get_mark())
      return (major, minor)
  def scan_yaml_directive_number(self, start_mark):
      """Scan a run of ASCII digits and return it as an int.

      See the specification for details.  Raises ScannerError when the
      current character is not a digit.
      """
      first = self.peek()
      if not ('0' <= first <= '9'):
          raise ScannerError("while scanning a directive", start_mark,
                  "expected a digit, but found %r" % first, self.get_mark())

      # Measure the run by peeking ahead, then consume it in one step.
      length = 0
      while '0' <= self.peek(length) <= '9':
          length += 1
      digits = self.prefix(length)
      self.forward(length)
      return int(digits)
  def scan_tag_directive_value(self, start_mark):
      """Scan the value of a TAG directive and return (handle, prefix).

      See the specification for details.  Spaces in front of the handle and
      between the handle and the prefix are skipped.
      """
      while self.peek() == ' ':
          self.forward()
      handle = self.scan_tag_directive_handle(start_mark)

      while self.peek() == ' ':
          self.forward()
      prefix = self.scan_tag_directive_prefix(start_mark)

      return (handle, prefix)
  680. def scan_tag_directive_handle(self, start_mark):
  681. # See the specification for details.
  682. value = self.scan_tag_handle('directive', start_mark)
  683. ch = self.peek()
  684. if ch != ' ':
  685. raise ScannerError("while scanning a directive", start_mark,
  686. "expected ' ', but found %r" % ch, self.get_mark())
  687. return value
  688. def scan_tag_directive_prefix(self, start_mark):
  689. # See the specification for details.
  690. value = self.scan_tag_uri('directive', start_mark)
  691. ch = self.peek()
  692. if ch not in '\0 \r\n\x85\u2028\u2029':
  693. raise ScannerError("while scanning a directive", start_mark,
  694. "expected ' ', but found %r" % ch, self.get_mark())
  695. return value
  696. def scan_directive_ignored_line(self, start_mark):
  697. # See the specification for details.
  698. while self.peek() == ' ':
  699. self.forward()
  700. if self.peek() == '#':
  701. while self.peek() not in '\0\r\n\x85\u2028\u2029':
  702. self.forward()
  703. ch = self.peek()
  704. if ch not in '\0\r\n\x85\u2028\u2029':
  705. raise ScannerError("while scanning a directive", start_mark,
  706. "expected a comment or a line break, but found %r"
  707. % ch, self.get_mark())
  708. self.scan_line_break()
  709. def scan_anchor(self, TokenClass):
  710. # The specification does not restrict characters for anchors and
  711. # aliases. This may lead to problems, for instance, the document:
  712. # [ *alias, value ]
  713. # can be interpteted in two ways, as
  714. # [ "value" ]
  715. # and
  716. # [ *alias , "value" ]
  717. # Therefore we restrict aliases to numbers and ASCII letters.
  718. start_mark = self.get_mark()
  719. indicator = self.peek()
  720. if indicator == '*':
  721. name = 'alias'
  722. else:
  723. name = 'anchor'
  724. self.forward()
  725. length = 0
  726. ch = self.peek(length)
  727. while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
  728. or ch in '-_':
  729. length += 1
  730. ch = self.peek(length)
  731. if not length:
  732. raise ScannerError("while scanning an %s" % name, start_mark,
  733. "expected alphabetic or numeric character, but found %r"
  734. % ch, self.get_mark())
  735. value = self.prefix(length)
  736. self.forward(length)
  737. ch = self.peek()
  738. if ch not in '\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
  739. raise ScannerError("while scanning an %s" % name, start_mark,
  740. "expected alphabetic or numeric character, but found %r"
  741. % ch, self.get_mark())
  742. end_mark = self.get_mark()
  743. return TokenClass(value, start_mark, end_mark)
  744. def scan_tag(self):
  745. # See the specification for details.
  746. start_mark = self.get_mark()
  747. ch = self.peek(1)
  748. if ch == '<':
  749. handle = None
  750. self.forward(2)
  751. suffix = self.scan_tag_uri('tag', start_mark)
  752. if self.peek() != '>':
  753. raise ScannerError("while parsing a tag", start_mark,
  754. "expected '>', but found %r" % self.peek(),
  755. self.get_mark())
  756. self.forward()
  757. elif ch in '\0 \t\r\n\x85\u2028\u2029':
  758. handle = None
  759. suffix = '!'
  760. self.forward()
  761. else:
  762. length = 1
  763. use_handle = False
  764. while ch not in '\0 \r\n\x85\u2028\u2029':
  765. if ch == '!':
  766. use_handle = True
  767. break
  768. length += 1
  769. ch = self.peek(length)
  770. handle = '!'
  771. if use_handle:
  772. handle = self.scan_tag_handle('tag', start_mark)
  773. else:
  774. handle = '!'
  775. self.forward()
  776. suffix = self.scan_tag_uri('tag', start_mark)
  777. ch = self.peek()
  778. if ch not in '\0 \r\n\x85\u2028\u2029':
  779. raise ScannerError("while scanning a tag", start_mark,
  780. "expected ' ', but found %r" % ch, self.get_mark())
  781. value = (handle, suffix)
  782. end_mark = self.get_mark()
  783. return TagToken(value, start_mark, end_mark)
    def scan_block_scalar(self, style):
        # Scan a block scalar and return it as a ScalarToken.
        #
        # `style` is the block indicator; '>' selects folding, any other
        # value is treated as not folded.  The token text is the
        # concatenation of the scanned line contents and the line breaks
        # kept by the folding and chomping rules below.
        # See the specification for details.
        if style == '>':
            folded = True
        else:
            folded = False
        chunks = []
        start_mark = self.get_mark()
        # Scan the header.
        self.forward()
        chomping, increment = self.scan_block_scalar_indicators(start_mark)
        self.scan_block_scalar_ignored_line(start_mark)
        # Determine the indentation level and go to the first non-empty line.
        min_indent = self.indent+1
        if min_indent < 1:
            min_indent = 1
        if increment is None:
            # No explicit indentation indicator: take the indentation from
            # the leading lines, but never less than min_indent.
            breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
            indent = max(min_indent, max_indent)
        else:
            indent = min_indent+increment-1
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
        line_break = ''
        # Scan the inner part of the block scalar.
        while self.column == indent and self.peek() != '\0':
            chunks.extend(breaks)
            # Remember whether this line starts with a non-blank character;
            # the folding rule below needs it.
            leading_non_space = self.peek() not in ' \t'
            length = 0
            while self.peek(length) not in '\0\r\n\x85\u2028\u2029':
                length += 1
            chunks.append(self.prefix(length))
            self.forward(length)
            line_break = self.scan_line_break()
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
            if self.column == indent and self.peek() != '\0':
                # Unfortunately, folding rules are ambiguous.
                #
                # This is the folding according to the specification:
                if folded and line_break == '\n' \
                        and leading_non_space and self.peek() not in ' \t':
                    # Fold: a single break between two non-blank-leading
                    # lines becomes a space; if extra breaks follow, they
                    # are emitted on the next iteration instead.
                    if not breaks:
                        chunks.append(' ')
                else:
                    chunks.append(line_break)
                # This is Clark Evans's interpretation (also in the spec
                # examples):
                #
                #if folded and line_break == '\n':
                #    if not breaks:
                #        if self.peek() not in ' \t':
                #            chunks.append(' ')
                #        else:
                #            chunks.append(line_break)
                #else:
                #    chunks.append(line_break)
            else:
                break
        # Chomp the tail.
        # chomping is None (no indicator): keep the final line break only.
        # chomping is True ('+'): keep it and the trailing breaks as well.
        # chomping is False ('-'): keep neither.
        if chomping is not False:
            chunks.append(line_break)
        if chomping is True:
            chunks.extend(breaks)
        # We are done.
        return ScalarToken(''.join(chunks), False, start_mark, end_mark,
                style)
  849. def scan_block_scalar_indicators(self, start_mark):
  850. # See the specification for details.
  851. chomping = None
  852. increment = None
  853. ch = self.peek()
  854. if ch in '+-':
  855. if ch == '+':
  856. chomping = True
  857. else:
  858. chomping = False
  859. self.forward()
  860. ch = self.peek()
  861. if ch in '0123456789':
  862. increment = int(ch)
  863. if increment == 0:
  864. raise ScannerError("while scanning a block scalar", start_mark,
  865. "expected indentation indicator in the range 1-9, but found 0",
  866. self.get_mark())
  867. self.forward()
  868. elif ch in '0123456789':
  869. increment = int(ch)
  870. if increment == 0:
  871. raise ScannerError("while scanning a block scalar", start_mark,
  872. "expected indentation indicator in the range 1-9, but found 0",
  873. self.get_mark())
  874. self.forward()
  875. ch = self.peek()
  876. if ch in '+-':
  877. if ch == '+':
  878. chomping = True
  879. else:
  880. chomping = False
  881. self.forward()
  882. ch = self.peek()
  883. if ch not in '\0 \r\n\x85\u2028\u2029':
  884. raise ScannerError("while scanning a block scalar", start_mark,
  885. "expected chomping or indentation indicators, but found %r"
  886. % ch, self.get_mark())
  887. return chomping, increment
  888. def scan_block_scalar_ignored_line(self, start_mark):
  889. # See the specification for details.
  890. while self.peek() == ' ':
  891. self.forward()
  892. if self.peek() == '#':
  893. while self.peek() not in '\0\r\n\x85\u2028\u2029':
  894. self.forward()
  895. ch = self.peek()
  896. if ch not in '\0\r\n\x85\u2028\u2029':
  897. raise ScannerError("while scanning a block scalar", start_mark,
  898. "expected a comment or a line break, but found %r" % ch,
  899. self.get_mark())
  900. self.scan_line_break()
  901. def scan_block_scalar_indentation(self):
  902. # See the specification for details.
  903. chunks = []
  904. max_indent = 0
  905. end_mark = self.get_mark()
  906. while self.peek() in ' \r\n\x85\u2028\u2029':
  907. if self.peek() != ' ':
  908. chunks.append(self.scan_line_break())
  909. end_mark = self.get_mark()
  910. else:
  911. self.forward()
  912. if self.column > max_indent:
  913. max_indent = self.column
  914. return chunks, max_indent, end_mark
  915. def scan_block_scalar_breaks(self, indent):
  916. # See the specification for details.
  917. chunks = []
  918. end_mark = self.get_mark()
  919. while self.column < indent and self.peek() == ' ':
  920. self.forward()
  921. while self.peek() in '\r\n\x85\u2028\u2029':
  922. chunks.append(self.scan_line_break())
  923. end_mark = self.get_mark()
  924. while self.column < indent and self.peek() == ' ':
  925. self.forward()
  926. return chunks, end_mark
  927. def scan_flow_scalar(self, style):
  928. # See the specification for details.
  929. # Note that we loose indentation rules for quoted scalars. Quoted
  930. # scalars don't need to adhere indentation because " and ' clearly
  931. # mark the beginning and the end of them. Therefore we are less
  932. # restrictive then the specification requires. We only need to check
  933. # that document separators are not included in scalars.
  934. if style == '"':
  935. double = True
  936. else:
  937. double = False
  938. chunks = []
  939. start_mark = self.get_mark()
  940. quote = self.peek()
  941. self.forward()
  942. chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
  943. while self.peek() != quote:
  944. chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
  945. chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
  946. self.forward()
  947. end_mark = self.get_mark()
  948. return ScalarToken(''.join(chunks), False, start_mark, end_mark,
  949. style)
    # Single-character escapes recognised after a backslash in a
    # double-quoted scalar, mapped to the character each one produces.
    ESCAPE_REPLACEMENTS = {
        '0':    '\0',       # null
        'a':    '\x07',     # bell
        'b':    '\x08',     # backspace
        't':    '\x09',     # horizontal tab
        '\t':   '\x09',     # a literal tab after the backslash is also a tab
        'n':    '\x0A',     # line feed
        'v':    '\x0B',     # vertical tab
        'f':    '\x0C',     # form feed
        'r':    '\x0D',     # carriage return
        'e':    '\x1B',     # escape
        ' ':    '\x20',     # space
        '\"':   '\"',       # double quote
        '\\':   '\\',       # backslash
        'N':    '\x85',     # next line (NEL)
        '_':    '\xA0',     # no-break space
        'L':    '\u2028',   # line separator
        'P':    '\u2029',   # paragraph separator
    }
    # Escapes that are followed by a fixed number of hexadecimal digits,
    # mapped to that number of digits.
    ESCAPE_CODES = {
        'x':    2,
        'u':    4,
        'U':    8,
    }
  974. def scan_flow_scalar_non_spaces(self, double, start_mark):
  975. # See the specification for details.
  976. chunks = []
  977. while True:
  978. length = 0
  979. while self.peek(length) not in '\'\"\\\0 \t\r\n\x85\u2028\u2029':
  980. length += 1
  981. if length:
  982. chunks.append(self.prefix(length))
  983. self.forward(length)
  984. ch = self.peek()
  985. if not double and ch == '\'' and self.peek(1) == '\'':
  986. chunks.append('\'')
  987. self.forward(2)
  988. elif (double and ch == '\'') or (not double and ch in '\"\\'):
  989. chunks.append(ch)
  990. self.forward()
  991. elif double and ch == '\\':
  992. self.forward()
  993. ch = self.peek()
  994. if ch in self.ESCAPE_REPLACEMENTS:
  995. chunks.append(self.ESCAPE_REPLACEMENTS[ch])
  996. self.forward()
  997. elif ch in self.ESCAPE_CODES:
  998. length = self.ESCAPE_CODES[ch]
  999. self.forward()
  1000. for k in range(length):
  1001. if self.peek(k) not in '0123456789ABCDEFabcdef':
  1002. raise ScannerError("while scanning a double-quoted scalar", start_mark,
  1003. "expected escape sequence of %d hexdecimal numbers, but found %r" %
  1004. (length, self.peek(k)), self.get_mark())
  1005. code = int(self.prefix(length), 16)
  1006. chunks.append(chr(code))
  1007. self.forward(length)
  1008. elif ch in '\r\n\x85\u2028\u2029':
  1009. self.scan_line_break()
  1010. chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
  1011. else:
  1012. raise ScannerError("while scanning a double-quoted scalar", start_mark,
  1013. "found unknown escape character %r" % ch, self.get_mark())
  1014. else:
  1015. return chunks
  1016. def scan_flow_scalar_spaces(self, double, start_mark):
  1017. # See the specification for details.
  1018. chunks = []
  1019. length = 0
  1020. while self.peek(length) in ' \t':
  1021. length += 1
  1022. whitespaces = self.prefix(length)
  1023. self.forward(length)
  1024. ch = self.peek()
  1025. if ch == '\0':
  1026. raise ScannerError("while scanning a quoted scalar", start_mark,
  1027. "found unexpected end of stream", self.get_mark())
  1028. elif ch in '\r\n\x85\u2028\u2029':
  1029. line_break = self.scan_line_break()
  1030. breaks = self.scan_flow_scalar_breaks(double, start_mark)
  1031. if line_break != '\n':
  1032. chunks.append(line_break)
  1033. elif not breaks:
  1034. chunks.append(' ')
  1035. chunks.extend(breaks)
  1036. else:
  1037. chunks.append(whitespaces)
  1038. return chunks
  1039. def scan_flow_scalar_breaks(self, double, start_mark):
  1040. # See the specification for details.
  1041. chunks = []
  1042. while True:
  1043. # Instead of checking indentation, we check for document
  1044. # separators.
  1045. prefix = self.prefix(3)
  1046. if (prefix == '---' or prefix == '...') \
  1047. and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
  1048. raise ScannerError("while scanning a quoted scalar", start_mark,
  1049. "found unexpected document separator", self.get_mark())
  1050. while self.peek() in ' \t':
  1051. self.forward()
  1052. if self.peek() in '\r\n\x85\u2028\u2029':
  1053. chunks.append(self.scan_line_break())
  1054. else:
  1055. return chunks
    def scan_plain(self):
        # Scan a plain (unquoted) scalar and return it as a ScalarToken
        # built with True as its second argument.
        #
        # See the specification for details.
        # We add an additional restriction for the flow context:
        #   plain scalars in the flow context cannot contain ',', ':' and '?'.
        # We also keep track of the `allow_simple_key` flag here.
        # Indentation rules are loosed for the flow context.
        chunks = []
        start_mark = self.get_mark()
        end_mark = start_mark
        indent = self.indent+1
        # We allow zero indentation for scalars, but then we need to check for
        # document separators at the beginning of the line.
        #if indent == 0:
        #    indent = 1
        # Whitespace scanned after the previous chunk; it is only added to
        # the result once another chunk follows it.
        spaces = []
        while True:
            length = 0
            # A '#' at the start of a chunk ends the scalar.
            if self.peek() == '#':
                break
            # Find the end of the current chunk: a blank, a line break or
            # '\0'; outside flow context a ':' followed by one of those; in
            # flow context any of ',:?[]{}'.
            while True:
                ch = self.peek(length)
                if ch in '\0 \t\r\n\x85\u2028\u2029'    \
                        or (not self.flow_level and ch == ':' and
                                self.peek(length+1) in '\0 \t\r\n\x85\u2028\u2029') \
                        or (self.flow_level and ch in ',:?[]{}'):
                    break
                length += 1
            # It's not clear what we should do with ':' in the flow context.
            if (self.flow_level and ch == ':'
                    and self.peek(length+1) not in '\0 \t\r\n\x85\u2028\u2029,[]{}'):
                # Move to the offending ':' first so the error mark points
                # at it.
                self.forward(length)
                raise ScannerError("while scanning a plain scalar", start_mark,
                    "found unexpected ':'", self.get_mark(),
                    "Please check http://pyyaml.org/wiki/YAMLColonInFlowContext for details.")
            if length == 0:
                break
            self.allow_simple_key = False
            chunks.extend(spaces)
            chunks.append(self.prefix(length))
            self.forward(length)
            # The token ends after the last chunk; trailing whitespace is
            # not part of it.
            end_mark = self.get_mark()
            spaces = self.scan_plain_spaces(indent, start_mark)
            # Stop when no whitespace followed (or a document separator was
            # met, which yields None), at a comment, or -- outside flow
            # context -- when the next line is indented less than `indent`.
            if not spaces or self.peek() == '#' \
                    or (not self.flow_level and self.column < indent):
                break
        return ScalarToken(''.join(chunks), True, start_mark, end_mark)
  1102. def scan_plain_spaces(self, indent, start_mark):
  1103. # See the specification for details.
  1104. # The specification is really confusing about tabs in plain scalars.
  1105. # We just forbid them completely. Do not use tabs in YAML!
  1106. chunks = []
  1107. length = 0
  1108. while self.peek(length) in ' ':
  1109. length += 1
  1110. whitespaces = self.prefix(length)
  1111. self.forward(length)
  1112. ch = self.peek()
  1113. if ch in '\r\n\x85\u2028\u2029':
  1114. line_break = self.scan_line_break()
  1115. self.allow_simple_key = True
  1116. prefix = self.prefix(3)
  1117. if (prefix == '---' or prefix == '...') \
  1118. and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
  1119. return
  1120. breaks = []
  1121. while self.peek() in ' \r\n\x85\u2028\u2029':
  1122. if self.peek() == ' ':
  1123. self.forward()
  1124. else:
  1125. breaks.append(self.scan_line_break())
  1126. prefix = self.prefix(3)
  1127. if (prefix == '---' or prefix == '...') \
  1128. and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
  1129. return
  1130. if line_break != '\n':
  1131. chunks.append(line_break)
  1132. elif not breaks:
  1133. chunks.append(' ')
  1134. chunks.extend(breaks)
  1135. elif whitespaces:
  1136. chunks.append(whitespaces)
  1137. return chunks
  1138. def scan_tag_handle(self, name, start_mark):
  1139. # See the specification for details.
  1140. # For some strange reasons, the specification does not allow '_' in
  1141. # tag handles. I have allowed it anyway.
  1142. ch = self.peek()
  1143. if ch != '!':
  1144. raise ScannerError("while scanning a %s" % name, start_mark,
  1145. "expected '!', but found %r" % ch, self.get_mark())
  1146. length = 1
  1147. ch = self.peek(length)
  1148. if ch != ' ':
  1149. while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
  1150. or ch in '-_':
  1151. length += 1
  1152. ch = self.peek(length)
  1153. if ch != '!':
  1154. self.forward(length)
  1155. raise ScannerError("while scanning a %s" % name, start_mark,
  1156. "expected '!', but found %r" % ch, self.get_mark())
  1157. length += 1
  1158. value = self.prefix(length)
  1159. self.forward(length)
  1160. return value
  1161. def scan_tag_uri(self, name, start_mark):
  1162. # See the specification for details.
  1163. # Note: we do not check if URI is well-formed.
  1164. chunks = []
  1165. length = 0
  1166. ch = self.peek(length)
  1167. while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
  1168. or ch in '-;/?:@&=+$,_.!~*\'()[]%':
  1169. if ch == '%':
  1170. chunks.append(self.prefix(length))
  1171. self.forward(length)
  1172. length = 0
  1173. chunks.append(self.scan_uri_escapes(name, start_mark))
  1174. else:
  1175. length += 1
  1176. ch = self.peek(length)
  1177. if length:
  1178. chunks.append(self.prefix(length))
  1179. self.forward(length)
  1180. length = 0
  1181. if not chunks:
  1182. raise ScannerError("while parsing a %s" % name, start_mark,
  1183. "expected URI, but found %r" % ch, self.get_mark())
  1184. return ''.join(chunks)
  1185. def scan_uri_escapes(self, name, start_mark):
  1186. # See the specification for details.
  1187. codes = []
  1188. mark = self.get_mark()
  1189. while self.peek() == '%':
  1190. self.forward()
  1191. for k in range(2):
  1192. if self.peek(k) not in '0123456789ABCDEFabcdef':
  1193. raise ScannerError("while scanning a %s" % name, start_mark,
  1194. "expected URI escape sequence of 2 hexdecimal numbers, but found %r"
  1195. % self.peek(k), self.get_mark())
  1196. codes.append(int(self.prefix(2), 16))
  1197. self.forward(2)
  1198. try:
  1199. value = bytes(codes).decode('utf-8')
  1200. except UnicodeDecodeError as exc:
  1201. raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark)
  1202. return value
  1203. def scan_line_break(self):
  1204. # Transforms:
  1205. # '\r\n' : '\n'
  1206. # '\r' : '\n'
  1207. # '\n' : '\n'
  1208. # '\x85' : '\n'
  1209. # '\u2028' : '\u2028'
  1210. # '\u2029 : '\u2029'
  1211. # default : ''
  1212. ch = self.peek()
  1213. if ch in '\r\n\x85':
  1214. if self.prefix(2) == '\r\n':
  1215. self.forward(2)
  1216. else:
  1217. self.forward()
  1218. return '\n'
  1219. elif ch in '\u2028\u2029':
  1220. self.forward()
  1221. return ch
  1222. return ''
  1223. #try:
  1224. # import psyco
  1225. # psyco.bind(Scanner)
  1226. #except ImportError:
  1227. # pass