scanner.py 51 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449
  1. # Scanner produces tokens of the following types:
  2. # STREAM-START
  3. # STREAM-END
  4. # DIRECTIVE(name, value)
  5. # DOCUMENT-START
  6. # DOCUMENT-END
  7. # BLOCK-SEQUENCE-START
  8. # BLOCK-MAPPING-START
  9. # BLOCK-END
  10. # FLOW-SEQUENCE-START
  11. # FLOW-MAPPING-START
  12. # FLOW-SEQUENCE-END
  13. # FLOW-MAPPING-END
  14. # BLOCK-ENTRY
  15. # FLOW-ENTRY
  16. # KEY
  17. # VALUE
  18. # ALIAS(value)
  19. # ANCHOR(value)
  20. # TAG(value)
  21. # SCALAR(value, plain, style)
  22. #
  23. # Read comments in the Scanner code for more details.
  24. #
  25. __all__ = ['Scanner', 'ScannerError']
  26. import sys
  27. from error import MarkedYAMLError
  28. from tokens import *
class ScannerError(MarkedYAMLError):
    """Raised when the input stream cannot be broken into tokens."""
    pass
class SimpleKey(object):
    # See below simple keys treatment.
    # A plain record describing a position where a simple key may start.
    def __init__(self, token_number, required, index, line, column, mark):
        self.token_number = token_number  # position of the key in the token stream
        self.required = required          # must a ':' follow, or is the key optional?
        self.index = index                # character offset where the key starts
        self.line = line                  # line where the key starts
        self.column = column              # column where the key starts
        self.mark = mark                  # Mark object for error reporting
  40. class Scanner(object):
    def __init__(self):
        """Initialize the scanner."""
        # It is assumed that Scanner and Reader will have a common descendant.
        # Reader do the dirty work of checking for BOM and converting the
        # input data to Unicode. It also adds NUL to the end.
        #
        # Reader supports the following methods
        #   self.peek(i=0)    # peek the next i-th character
        #   self.prefix(l=1)  # peek the next l characters
        #   self.forward(l=1) # read the next l characters and move the pointer.

        # Had we reached the end of the stream?
        self.done = False

        # The number of unclosed '{' and '['. `flow_level == 0` means block
        # context.
        self.flow_level = 0

        # List of processed tokens that are not yet emitted.
        self.tokens = []

        # Add the STREAM-START token.
        self.fetch_stream_start()

        # Number of tokens that were emitted through the `get_token` method.
        self.tokens_taken = 0

        # The current indentation level.
        self.indent = -1

        # Past indentation levels.
        self.indents = []

        # Variables related to simple keys treatment.

        # A simple key is a key that is not denoted by the '?' indicator.
        # Example of simple keys:
        #   ---
        #   block simple key: value
        #   ? not a simple key:
        #   : { flow simple key: value }
        # We emit the KEY token before all keys, so when we find a potential
        # simple key, we try to locate the corresponding ':' indicator.
        # Simple keys should be limited to a single line and 1024 characters.

        # Can a simple key start at the current position? A simple key may
        # start:
        # - at the beginning of the line, not counting indentation spaces
        #       (in block context),
        # - after '{', '[', ',' (in the flow context),
        # - after '?', ':', '-' (in the block context).
        # In the block context, this flag also signifies if a block collection
        # may start at the current position.
        self.allow_simple_key = True

        # Keep track of possible simple keys. This is a dictionary. The key
        # is `flow_level`; there can be no more than one possible simple key
        # for each level. The value is a SimpleKey record:
        #   (token_number, required, index, line, column, mark)
        # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
        # '[', or '{' tokens.
        self.possible_simple_keys = {}
  92. # Public methods.
  93. def check_token(self, *choices):
  94. # Check if the next token is one of the given types.
  95. while self.need_more_tokens():
  96. self.fetch_more_tokens()
  97. if self.tokens:
  98. if not choices:
  99. return True
  100. for choice in choices:
  101. if isinstance(self.tokens[0], choice):
  102. return True
  103. return False
  104. def peek_token(self):
  105. # Return the next token, but do not delete if from the queue.
  106. # Return None if no more tokens.
  107. while self.need_more_tokens():
  108. self.fetch_more_tokens()
  109. if self.tokens:
  110. return self.tokens[0]
  111. else:
  112. return None
  113. def get_token(self):
  114. # Return the next token.
  115. while self.need_more_tokens():
  116. self.fetch_more_tokens()
  117. if self.tokens:
  118. self.tokens_taken += 1
  119. return self.tokens.pop(0)
  120. # Private methods.
  121. def need_more_tokens(self):
  122. if self.done:
  123. return False
  124. if not self.tokens:
  125. return True
  126. # The current token may be a potential simple key, so we
  127. # need to look further.
  128. self.stale_possible_simple_keys()
  129. if self.next_possible_simple_key() == self.tokens_taken:
  130. return True
    def fetch_more_tokens(self):
        # Scan the input for the next token and append it (plus any implied
        # structural tokens such as BLOCK-END) to `self.tokens`.

        # Eat whitespaces and comments until we reach the next token.
        self.scan_to_next_token()

        # Remove obsolete possible simple keys.
        self.stale_possible_simple_keys()

        # Compare the current indentation and column. It may add some tokens
        # and decrease the current indentation level.
        self.unwind_indent(self.column)

        # Peek the next character.
        ch = self.peek()

        # Is it the end of stream?
        if ch == u'\0':
            return self.fetch_stream_end()

        # Is it a directive?
        if ch == u'%' and self.check_directive():
            return self.fetch_directive()

        # Is it the document start?
        if ch == u'-' and self.check_document_start():
            return self.fetch_document_start()

        # Is it the document end?
        if ch == u'.' and self.check_document_end():
            return self.fetch_document_end()

        # TODO: support for BOM within a stream.
        #if ch == u'\uFEFF':
        #    return self.fetch_bom()    <-- issue BOMToken

        # Note: the order of the following checks is NOT significant.

        # Is it the flow sequence start indicator?
        if ch == u'[':
            return self.fetch_flow_sequence_start()

        # Is it the flow mapping start indicator?
        if ch == u'{':
            return self.fetch_flow_mapping_start()

        # Is it the flow sequence end indicator?
        if ch == u']':
            return self.fetch_flow_sequence_end()

        # Is it the flow mapping end indicator?
        if ch == u'}':
            return self.fetch_flow_mapping_end()

        # Is it the flow entry indicator?
        if ch == u',':
            return self.fetch_flow_entry()

        # Is it the block entry indicator?
        if ch == u'-' and self.check_block_entry():
            return self.fetch_block_entry()

        # Is it the key indicator?
        if ch == u'?' and self.check_key():
            return self.fetch_key()

        # Is it the value indicator?
        if ch == u':' and self.check_value():
            return self.fetch_value()

        # Is it an alias?
        if ch == u'*':
            return self.fetch_alias()

        # Is it an anchor?
        if ch == u'&':
            return self.fetch_anchor()

        # Is it a tag?
        if ch == u'!':
            return self.fetch_tag()

        # Is it a literal scalar? (Block scalars only exist in block context.)
        if ch == u'|' and not self.flow_level:
            return self.fetch_literal()

        # Is it a folded scalar?
        if ch == u'>' and not self.flow_level:
            return self.fetch_folded()

        # Is it a single quoted scalar?
        if ch == u'\'':
            return self.fetch_single()

        # Is it a double quoted scalar?
        if ch == u'\"':
            return self.fetch_double()

        # It must be a plain scalar then.
        if self.check_plain():
            return self.fetch_plain()

        # No? It's an error. Let's produce a nice error message.
        raise ScannerError("while scanning for the next token", None,
                "found character %r that cannot start any token"
                % ch.encode('utf-8'), self.get_mark())
  209. # Simple keys treatment.
  210. def next_possible_simple_key(self):
  211. # Return the number of the nearest possible simple key. Actually we
  212. # don't need to loop through the whole dictionary. We may replace it
  213. # with the following code:
  214. # if not self.possible_simple_keys:
  215. # return None
  216. # return self.possible_simple_keys[
  217. # min(self.possible_simple_keys.keys())].token_number
  218. min_token_number = None
  219. for level in self.possible_simple_keys:
  220. key = self.possible_simple_keys[level]
  221. if min_token_number is None or key.token_number < min_token_number:
  222. min_token_number = key.token_number
  223. return min_token_number
  224. def stale_possible_simple_keys(self):
  225. # Remove entries that are no longer possible simple keys. According to
  226. # the YAML specification, simple keys
  227. # - should be limited to a single line,
  228. # - should be no longer than 1024 characters.
  229. # Disabling this procedure will allow simple keys of any length and
  230. # height (may cause problems if indentation is broken though).
  231. for level in self.possible_simple_keys.keys():
  232. key = self.possible_simple_keys[level]
  233. if key.line != self.line \
  234. or self.index-key.index > 1024:
  235. if key.required:
  236. raise ScannerError("while scanning a simple key", key.mark,
  237. "could not find expected ':'", self.get_mark())
  238. del self.possible_simple_keys[level]
    def save_possible_simple_key(self):
        # The next token may start a simple key. We check if it's possible
        # and save its position. This function is called for
        #   ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.

        # Check if a simple key is required at the current position: in block
        # context, a token sitting exactly at the indentation level can only
        # be a key.
        required = not self.flow_level and self.indent == self.column

        # The next token might be a simple key. Let's save its number and
        # position.
        if self.allow_simple_key:
            self.remove_possible_simple_key()
            token_number = self.tokens_taken+len(self.tokens)
            key = SimpleKey(token_number, required,
                    self.index, self.line, self.column, self.get_mark())
            self.possible_simple_keys[self.flow_level] = key
  253. def remove_possible_simple_key(self):
  254. # Remove the saved possible key position at the current flow level.
  255. if self.flow_level in self.possible_simple_keys:
  256. key = self.possible_simple_keys[self.flow_level]
  257. if key.required:
  258. raise ScannerError("while scanning a simple key", key.mark,
  259. "could not find expected ':'", self.get_mark())
  260. del self.possible_simple_keys[self.flow_level]
  261. # Indentation functions.
    def unwind_indent(self, column):
        # Pop indentation levels deeper than `column`, emitting a BLOCK-END
        # token for each popped level. Does nothing in flow context.

        ## In flow context, tokens should respect indentation.
        ## Actually the condition should be `self.indent >= column` according to
        ## the spec. But this condition will prohibit intuitively correct
        ## constructions such as
        ## key : {
        ## }
        #if self.flow_level and self.indent > column:
        #    raise ScannerError(None, None,
        #            "invalid indentation or unclosed '[' or '{'",
        #            self.get_mark())

        # In the flow context, indentation is ignored. We make the scanner less
        # restrictive than the specification requires.
        if self.flow_level:
            return

        # In block context, we may need to issue the BLOCK-END tokens.
        while self.indent > column:
            mark = self.get_mark()
            self.indent = self.indents.pop()
            self.tokens.append(BlockEndToken(mark, mark))
  282. def add_indent(self, column):
  283. # Check if we need to increase indentation.
  284. if self.indent < column:
  285. self.indents.append(self.indent)
  286. self.indent = column
  287. return True
  288. return False
  289. # Fetchers.
    def fetch_stream_start(self):
        # We always add STREAM-START as the first token and STREAM-END as the
        # last token.

        # Read the token.
        mark = self.get_mark()

        # Add STREAM-START. `self.encoding` is supplied by the Reader mixin.
        self.tokens.append(StreamStartToken(mark, mark,
            encoding=self.encoding))
    def fetch_stream_end(self):
        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False
        self.possible_simple_keys = {}

        # Read the token.
        mark = self.get_mark()

        # Add STREAM-END.
        self.tokens.append(StreamEndToken(mark, mark))

        # The stream is finished.
        self.done = True
    def fetch_directive(self):
        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Scan and add DIRECTIVE.
        self.tokens.append(self.scan_directive())
    def fetch_document_start(self):
        # '---' at column 0: emit DOCUMENT-START.
        self.fetch_document_indicator(DocumentStartToken)
    def fetch_document_end(self):
        # '...' at column 0: emit DOCUMENT-END.
        self.fetch_document_indicator(DocumentEndToken)
    def fetch_document_indicator(self, TokenClass):
        # Common path for '---' and '...': both are 3 characters wide.

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys. Note that there could not be a block collection
        # after '---'.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Add DOCUMENT-START or DOCUMENT-END.
        start_mark = self.get_mark()
        self.forward(3)
        end_mark = self.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
    def fetch_flow_sequence_start(self):
        # '[': open a flow sequence.
        self.fetch_flow_collection_start(FlowSequenceStartToken)
    def fetch_flow_mapping_start(self):
        # '{': open a flow mapping.
        self.fetch_flow_collection_start(FlowMappingStartToken)
    def fetch_flow_collection_start(self, TokenClass):
        # '[' and '{' may start a simple key.
        self.save_possible_simple_key()

        # Increase the flow level.
        self.flow_level += 1

        # Simple keys are allowed after '[' and '{'.
        self.allow_simple_key = True

        # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
    def fetch_flow_sequence_end(self):
        # ']': close a flow sequence.
        self.fetch_flow_collection_end(FlowSequenceEndToken)
    def fetch_flow_mapping_end(self):
        # '}': close a flow mapping.
        self.fetch_flow_collection_end(FlowMappingEndToken)
    def fetch_flow_collection_end(self, TokenClass):
        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Decrease the flow level.
        self.flow_level -= 1

        # No simple keys after ']' or '}'.
        self.allow_simple_key = False

        # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
    def fetch_flow_entry(self):
        # Simple keys are allowed after ','.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add FLOW-ENTRY.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(FlowEntryToken(start_mark, end_mark))
    def fetch_block_entry(self):
        # Block context needs additional checks.
        if not self.flow_level:

            # Are we allowed to start a new entry?
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "sequence entries are not allowed here",
                        self.get_mark())

            # We may need to add BLOCK-SEQUENCE-START.
            if self.add_indent(self.column):
                mark = self.get_mark()
                self.tokens.append(BlockSequenceStartToken(mark, mark))

        # It's an error for the block entry to occur in the flow context,
        # but we let the parser detect this.
        else:
            pass

        # Simple keys are allowed after '-'.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add BLOCK-ENTRY.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(BlockEntryToken(start_mark, end_mark))
    def fetch_key(self):
        # Handles the explicit '?' key indicator.

        # Block context needs additional checks.
        if not self.flow_level:

            # Are we allowed to start a key (not necessary a simple)?
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "mapping keys are not allowed here",
                        self.get_mark())

            # We may need to add BLOCK-MAPPING-START.
            if self.add_indent(self.column):
                mark = self.get_mark()
                self.tokens.append(BlockMappingStartToken(mark, mark))

        # Simple keys are allowed after '?' in the block context.
        self.allow_simple_key = not self.flow_level

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add KEY.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(KeyToken(start_mark, end_mark))
    def fetch_value(self):
        # Handles the ':' indicator; may retroactively insert a KEY token
        # (and a BLOCK-MAPPING-START) at the saved simple-key position.

        # Do we determine a simple key?
        if self.flow_level in self.possible_simple_keys:

            # Add KEY. The token is inserted at the position recorded when
            # the key candidate was saved, not appended at the end.
            key = self.possible_simple_keys[self.flow_level]
            del self.possible_simple_keys[self.flow_level]
            self.tokens.insert(key.token_number-self.tokens_taken,
                    KeyToken(key.mark, key.mark))

            # If this key starts a new block mapping, we need to add
            # BLOCK-MAPPING-START.
            if not self.flow_level:
                if self.add_indent(key.column):
                    self.tokens.insert(key.token_number-self.tokens_taken,
                            BlockMappingStartToken(key.mark, key.mark))

            # There cannot be two simple keys one after another.
            self.allow_simple_key = False

        # It must be a part of a complex key.
        else:

            # Block context needs additional checks.
            # (Do we really need them? They will be caught by the parser
            # anyway.)
            if not self.flow_level:

                # We are allowed to start a complex value if and only if
                # we can start a simple key.
                if not self.allow_simple_key:
                    raise ScannerError(None, None,
                            "mapping values are not allowed here",
                            self.get_mark())

            # If this value starts a new block mapping, we need to add
            # BLOCK-MAPPING-START. It will be detected as an error later by
            # the parser.
            if not self.flow_level:
                if self.add_indent(self.column):
                    mark = self.get_mark()
                    self.tokens.append(BlockMappingStartToken(mark, mark))

        # Simple keys are allowed after ':' in the block context.
        self.allow_simple_key = not self.flow_level

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add VALUE.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(ValueToken(start_mark, end_mark))
    def fetch_alias(self):
        # ALIAS could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after ALIAS.
        self.allow_simple_key = False

        # Scan and add ALIAS.
        self.tokens.append(self.scan_anchor(AliasToken))
    def fetch_anchor(self):
        # ANCHOR could start a simple key.
        self.save_possible_simple_key()

        # No simple keys after ANCHOR.
        self.allow_simple_key = False

        # Scan and add ANCHOR.
        self.tokens.append(self.scan_anchor(AnchorToken))
    def fetch_tag(self):
        # TAG could start a simple key.
        self.save_possible_simple_key()

        # No simple keys after TAG.
        self.allow_simple_key = False

        # Scan and add TAG.
        self.tokens.append(self.scan_tag())
    def fetch_literal(self):
        # '|' in block context: literal block scalar.
        self.fetch_block_scalar(style='|')
    def fetch_folded(self):
        # '>' in block context: folded block scalar.
        self.fetch_block_scalar(style='>')
    def fetch_block_scalar(self, style):
        # A simple key may follow a block scalar.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Scan and add SCALAR.
        self.tokens.append(self.scan_block_scalar(style))
    def fetch_single(self):
        # "'" starts a single-quoted flow scalar.
        self.fetch_flow_scalar(style='\'')
    def fetch_double(self):
        # '"' starts a double-quoted flow scalar.
        self.fetch_flow_scalar(style='"')
    def fetch_flow_scalar(self, style):
        # A flow scalar could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after flow scalars.
        self.allow_simple_key = False

        # Scan and add SCALAR.
        self.tokens.append(self.scan_flow_scalar(style))
    def fetch_plain(self):
        # A plain scalar could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after plain scalars. But note that `scan_plain` will
        # change this flag if the scan is finished at the beginning of the
        # line.
        self.allow_simple_key = False

        # Scan and add SCALAR. May change `allow_simple_key`.
        self.tokens.append(self.scan_plain())
  519. # Checkers.
  520. def check_directive(self):
  521. # DIRECTIVE: ^ '%' ...
  522. # The '%' indicator is already checked.
  523. if self.column == 0:
  524. return True
  525. def check_document_start(self):
  526. # DOCUMENT-START: ^ '---' (' '|'\n')
  527. if self.column == 0:
  528. if self.prefix(3) == u'---' \
  529. and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
  530. return True
  531. def check_document_end(self):
  532. # DOCUMENT-END: ^ '...' (' '|'\n')
  533. if self.column == 0:
  534. if self.prefix(3) == u'...' \
  535. and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
  536. return True
  537. def check_block_entry(self):
  538. # BLOCK-ENTRY: '-' (' '|'\n')
  539. return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
  540. def check_key(self):
  541. # KEY(flow context): '?'
  542. if self.flow_level:
  543. return True
  544. # KEY(block context): '?' (' '|'\n')
  545. else:
  546. return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
  547. def check_value(self):
  548. # VALUE(flow context): ':'
  549. if self.flow_level:
  550. return True
  551. # VALUE(block context): ':' (' '|'\n')
  552. else:
  553. return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
  554. def check_plain(self):
  555. # A plain scalar may start with any non-space character except:
  556. # '-', '?', ':', ',', '[', ']', '{', '}',
  557. # '#', '&', '*', '!', '|', '>', '\'', '\"',
  558. # '%', '@', '`'.
  559. #
  560. # It may also start with
  561. # '-', '?', ':'
  562. # if it is followed by a non-space character.
  563. #
  564. # Note that we limit the last rule to the block context (except the
  565. # '-' character) because we want the flow context to be space
  566. # independent.
  567. ch = self.peek()
  568. return ch not in u'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`' \
  569. or (self.peek(1) not in u'\0 \t\r\n\x85\u2028\u2029'
  570. and (ch == u'-' or (not self.flow_level and ch in u'?:')))
  571. # Scanners.
    def scan_to_next_token(self):
        # We ignore spaces, line breaks and comments.
        # If we find a line break in the block context, we set the flag
        # `allow_simple_key` on.
        # The byte order mark is stripped if it's the first character in the
        # stream. We do not yet support BOM inside the stream as the
        # specification requires. Any such mark will be considered as a part
        # of the document.
        #
        # TODO: We need to make tab handling rules more sane. A good rule is
        #   Tabs cannot precede tokens
        #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
        #   KEY(block), VALUE(block), BLOCK-ENTRY
        # So the checking code is
        #   if <TAB>:
        #       self.allow_simple_keys = False
        # We also need to add the check for `allow_simple_keys == True` to
        # `unwind_indent` before issuing BLOCK-END.
        # Scanners for block, flow, and plain scalars need to be modified.

        # Strip a leading BOM only at the very start of the stream.
        if self.index == 0 and self.peek() == u'\uFEFF':
            self.forward()
        found = False
        while not found:
            # Skip spaces (tabs are deliberately not skipped here; see the
            # TODO above).
            while self.peek() == u' ':
                self.forward()
            # Skip a comment to the end of the line.
            if self.peek() == u'#':
                while self.peek() not in u'\0\r\n\x85\u2028\u2029':
                    self.forward()
            # Consume a line break, if any; in block context a break means a
            # simple key may start on the next line.
            if self.scan_line_break():
                if not self.flow_level:
                    self.allow_simple_key = True
            else:
                found = True
    def scan_directive(self):
        # See the specification for details.
        # Consumes the leading '%', the directive name and its value, plus
        # the rest of the line, and returns a DirectiveToken.
        start_mark = self.get_mark()
        self.forward()
        name = self.scan_directive_name(start_mark)
        value = None
        if name == u'YAML':
            value = self.scan_yaml_directive_value(start_mark)
            end_mark = self.get_mark()
        elif name == u'TAG':
            value = self.scan_tag_directive_value(start_mark)
            end_mark = self.get_mark()
        else:
            # Unknown directive: skip the rest of the line, value stays None.
            end_mark = self.get_mark()
            while self.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.forward()
        self.scan_directive_ignored_line(start_mark)
        return DirectiveToken(name, value, start_mark, end_mark)
    def scan_directive_name(self, start_mark):
        # See the specification for details.
        # A directive name is a non-empty run of ASCII alphanumerics plus
        # '-' and '_', terminated by a space, a break, or end of stream.
        length = 0
        ch = self.peek(length)
        while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z'  \
                or ch in u'-_':
            length += 1
            ch = self.peek(length)
        if not length:
            raise ScannerError("while scanning a directive", start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.get_mark())
        value = self.prefix(length)
        self.forward(length)
        ch = self.peek()
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.get_mark())
        return value
  643. def scan_yaml_directive_value(self, start_mark):
  644. # See the specification for details.
  645. while self.peek() == u' ':
  646. self.forward()
  647. major = self.scan_yaml_directive_number(start_mark)
  648. if self.peek() != '.':
  649. raise ScannerError("while scanning a directive", start_mark,
  650. "expected a digit or '.', but found %r"
  651. % self.peek().encode('utf-8'),
  652. self.get_mark())
  653. self.forward()
  654. minor = self.scan_yaml_directive_number(start_mark)
  655. if self.peek() not in u'\0 \r\n\x85\u2028\u2029':
  656. raise ScannerError("while scanning a directive", start_mark,
  657. "expected a digit or ' ', but found %r"
  658. % self.peek().encode('utf-8'),
  659. self.get_mark())
  660. return (major, minor)
  661. def scan_yaml_directive_number(self, start_mark):
  662. # See the specification for details.
  663. ch = self.peek()
  664. if not (u'0' <= ch <= u'9'):
  665. raise ScannerError("while scanning a directive", start_mark,
  666. "expected a digit, but found %r" % ch.encode('utf-8'),
  667. self.get_mark())
  668. length = 0
  669. while u'0' <= self.peek(length) <= u'9':
  670. length += 1
  671. value = int(self.prefix(length))
  672. self.forward(length)
  673. return value
    def scan_tag_directive_value(self, start_mark):
        # See the specification for details.
        # Returns the (handle, prefix) pair of a '%TAG' directive.
        while self.peek() == u' ':
            self.forward()
        handle = self.scan_tag_directive_handle(start_mark)
        while self.peek() == u' ':
            self.forward()
        prefix = self.scan_tag_directive_prefix(start_mark)
        return (handle, prefix)
  683. def scan_tag_directive_handle(self, start_mark):
  684. # See the specification for details.
  685. value = self.scan_tag_handle('directive', start_mark)
  686. ch = self.peek()
  687. if ch != u' ':
  688. raise ScannerError("while scanning a directive", start_mark,
  689. "expected ' ', but found %r" % ch.encode('utf-8'),
  690. self.get_mark())
  691. return value
  692. def scan_tag_directive_prefix(self, start_mark):
  693. # See the specification for details.
  694. value = self.scan_tag_uri('directive', start_mark)
  695. ch = self.peek()
  696. if ch not in u'\0 \r\n\x85\u2028\u2029':
  697. raise ScannerError("while scanning a directive", start_mark,
  698. "expected ' ', but found %r" % ch.encode('utf-8'),
  699. self.get_mark())
  700. return value
  701. def scan_directive_ignored_line(self, start_mark):
  702. # See the specification for details.
  703. while self.peek() == u' ':
  704. self.forward()
  705. if self.peek() == u'#':
  706. while self.peek() not in u'\0\r\n\x85\u2028\u2029':
  707. self.forward()
  708. ch = self.peek()
  709. if ch not in u'\0\r\n\x85\u2028\u2029':
  710. raise ScannerError("while scanning a directive", start_mark,
  711. "expected a comment or a line break, but found %r"
  712. % ch.encode('utf-8'), self.get_mark())
  713. self.scan_line_break()
  714. def scan_anchor(self, TokenClass):
  715. # The specification does not restrict characters for anchors and
  716. # aliases. This may lead to problems, for instance, the document:
  717. # [ *alias, value ]
  718. # can be interpreted in two ways, as
  719. # [ "value" ]
  720. # and
  721. # [ *alias , "value" ]
  722. # Therefore we restrict aliases to numbers and ASCII letters.
  723. start_mark = self.get_mark()
  724. indicator = self.peek()
  725. if indicator == u'*':
  726. name = 'alias'
  727. else:
  728. name = 'anchor'
  729. self.forward()
  730. length = 0
  731. ch = self.peek(length)
  732. while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \
  733. or ch in u'-_':
  734. length += 1
  735. ch = self.peek(length)
  736. if not length:
  737. raise ScannerError("while scanning an %s" % name, start_mark,
  738. "expected alphabetic or numeric character, but found %r"
  739. % ch.encode('utf-8'), self.get_mark())
  740. value = self.prefix(length)
  741. self.forward(length)
  742. ch = self.peek()
  743. if ch not in u'\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
  744. raise ScannerError("while scanning an %s" % name, start_mark,
  745. "expected alphabetic or numeric character, but found %r"
  746. % ch.encode('utf-8'), self.get_mark())
  747. end_mark = self.get_mark()
  748. return TokenClass(value, start_mark, end_mark)
  749. def scan_tag(self):
  750. # See the specification for details.
  751. start_mark = self.get_mark()
  752. ch = self.peek(1)
  753. if ch == u'<':
  754. handle = None
  755. self.forward(2)
  756. suffix = self.scan_tag_uri('tag', start_mark)
  757. if self.peek() != u'>':
  758. raise ScannerError("while parsing a tag", start_mark,
  759. "expected '>', but found %r" % self.peek().encode('utf-8'),
  760. self.get_mark())
  761. self.forward()
  762. elif ch in u'\0 \t\r\n\x85\u2028\u2029':
  763. handle = None
  764. suffix = u'!'
  765. self.forward()
  766. else:
  767. length = 1
  768. use_handle = False
  769. while ch not in u'\0 \r\n\x85\u2028\u2029':
  770. if ch == u'!':
  771. use_handle = True
  772. break
  773. length += 1
  774. ch = self.peek(length)
  775. handle = u'!'
  776. if use_handle:
  777. handle = self.scan_tag_handle('tag', start_mark)
  778. else:
  779. handle = u'!'
  780. self.forward()
  781. suffix = self.scan_tag_uri('tag', start_mark)
  782. ch = self.peek()
  783. if ch not in u'\0 \r\n\x85\u2028\u2029':
  784. raise ScannerError("while scanning a tag", start_mark,
  785. "expected ' ', but found %r" % ch.encode('utf-8'),
  786. self.get_mark())
  787. value = (handle, suffix)
  788. end_mark = self.get_mark()
  789. return TagToken(value, start_mark, end_mark)
    def scan_block_scalar(self, style):
        # Scan a literal ('|') or folded ('>') block scalar and return a
        # ScalarToken.  `style` is the indicator character.
        # See the specification for details.
        if style == '>':
            folded = True
        else:
            folded = False
        chunks = []
        start_mark = self.get_mark()
        # Scan the header.
        self.forward()
        chomping, increment = self.scan_block_scalar_indicators(start_mark)
        self.scan_block_scalar_ignored_line(start_mark)
        # Determine the indentation level and go to the first non-empty line.
        min_indent = self.indent+1
        if min_indent < 1:
            min_indent = 1
        if increment is None:
            # No explicit indentation indicator: auto-detect from the
            # first non-empty line.
            breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
            indent = max(min_indent, max_indent)
        else:
            indent = min_indent+increment-1
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
        line_break = u''
        # Scan the inner part of the block scalar.
        while self.column == indent and self.peek() != u'\0':
            chunks.extend(breaks)
            leading_non_space = self.peek() not in u' \t'
            length = 0
            while self.peek(length) not in u'\0\r\n\x85\u2028\u2029':
                length += 1
            chunks.append(self.prefix(length))
            self.forward(length)
            line_break = self.scan_line_break()
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
            if self.column == indent and self.peek() != u'\0':
                # Unfortunately, folding rules are ambiguous.
                #
                # This is the folding according to the specification:
                if folded and line_break == u'\n' \
                        and leading_non_space and self.peek() not in u' \t':
                    if not breaks:
                        chunks.append(u' ')
                else:
                    chunks.append(line_break)
                # This is Clark Evans's interpretation (also in the spec
                # examples):
                #
                #if folded and line_break == u'\n':
                #    if not breaks:
                #        if self.peek() not in ' \t':
                #            chunks.append(u' ')
                #        else:
                #            chunks.append(line_break)
                #else:
                #    chunks.append(line_break)
            else:
                break
        # Chomp the tail.
        if chomping is not False:
            # 'clip' (None) and 'keep' (True) both retain the final break.
            chunks.append(line_break)
        if chomping is True:
            # 'keep' also retains any trailing empty lines.
            chunks.extend(breaks)
        # We are done.
        return ScalarToken(u''.join(chunks), False, start_mark, end_mark,
                style)
  855. def scan_block_scalar_indicators(self, start_mark):
  856. # See the specification for details.
  857. chomping = None
  858. increment = None
  859. ch = self.peek()
  860. if ch in u'+-':
  861. if ch == '+':
  862. chomping = True
  863. else:
  864. chomping = False
  865. self.forward()
  866. ch = self.peek()
  867. if ch in u'0123456789':
  868. increment = int(ch)
  869. if increment == 0:
  870. raise ScannerError("while scanning a block scalar", start_mark,
  871. "expected indentation indicator in the range 1-9, but found 0",
  872. self.get_mark())
  873. self.forward()
  874. elif ch in u'0123456789':
  875. increment = int(ch)
  876. if increment == 0:
  877. raise ScannerError("while scanning a block scalar", start_mark,
  878. "expected indentation indicator in the range 1-9, but found 0",
  879. self.get_mark())
  880. self.forward()
  881. ch = self.peek()
  882. if ch in u'+-':
  883. if ch == '+':
  884. chomping = True
  885. else:
  886. chomping = False
  887. self.forward()
  888. ch = self.peek()
  889. if ch not in u'\0 \r\n\x85\u2028\u2029':
  890. raise ScannerError("while scanning a block scalar", start_mark,
  891. "expected chomping or indentation indicators, but found %r"
  892. % ch.encode('utf-8'), self.get_mark())
  893. return chomping, increment
  894. def scan_block_scalar_ignored_line(self, start_mark):
  895. # See the specification for details.
  896. while self.peek() == u' ':
  897. self.forward()
  898. if self.peek() == u'#':
  899. while self.peek() not in u'\0\r\n\x85\u2028\u2029':
  900. self.forward()
  901. ch = self.peek()
  902. if ch not in u'\0\r\n\x85\u2028\u2029':
  903. raise ScannerError("while scanning a block scalar", start_mark,
  904. "expected a comment or a line break, but found %r"
  905. % ch.encode('utf-8'), self.get_mark())
  906. self.scan_line_break()
  907. def scan_block_scalar_indentation(self):
  908. # See the specification for details.
  909. chunks = []
  910. max_indent = 0
  911. end_mark = self.get_mark()
  912. while self.peek() in u' \r\n\x85\u2028\u2029':
  913. if self.peek() != u' ':
  914. chunks.append(self.scan_line_break())
  915. end_mark = self.get_mark()
  916. else:
  917. self.forward()
  918. if self.column > max_indent:
  919. max_indent = self.column
  920. return chunks, max_indent, end_mark
  921. def scan_block_scalar_breaks(self, indent):
  922. # See the specification for details.
  923. chunks = []
  924. end_mark = self.get_mark()
  925. while self.column < indent and self.peek() == u' ':
  926. self.forward()
  927. while self.peek() in u'\r\n\x85\u2028\u2029':
  928. chunks.append(self.scan_line_break())
  929. end_mark = self.get_mark()
  930. while self.column < indent and self.peek() == u' ':
  931. self.forward()
  932. return chunks, end_mark
  933. def scan_flow_scalar(self, style):
  934. # See the specification for details.
  935. # Note that we loose indentation rules for quoted scalars. Quoted
  936. # scalars don't need to adhere indentation because " and ' clearly
  937. # mark the beginning and the end of them. Therefore we are less
  938. # restrictive then the specification requires. We only need to check
  939. # that document separators are not included in scalars.
  940. if style == '"':
  941. double = True
  942. else:
  943. double = False
  944. chunks = []
  945. start_mark = self.get_mark()
  946. quote = self.peek()
  947. self.forward()
  948. chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
  949. while self.peek() != quote:
  950. chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
  951. chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
  952. self.forward()
  953. end_mark = self.get_mark()
  954. return ScalarToken(u''.join(chunks), False, start_mark, end_mark,
  955. style)
    # Single-character escape sequences recognized in double-quoted
    # scalars, mapped to their replacement characters.
    ESCAPE_REPLACEMENTS = {
        u'0':   u'\0',      # null
        u'a':   u'\x07',    # bell
        u'b':   u'\x08',    # backspace
        u't':   u'\x09',    # horizontal tab
        u'\t':  u'\x09',    # a literal tab after '\' also means tab
        u'n':   u'\x0A',    # line feed
        u'v':   u'\x0B',    # vertical tab
        u'f':   u'\x0C',    # form feed
        u'r':   u'\x0D',    # carriage return
        u'e':   u'\x1B',    # escape
        u' ':   u'\x20',    # space
        u'\"':  u'\"',
        u'\\':  u'\\',
        u'/':   u'/',
        u'N':   u'\x85',    # next line (NEL)
        u'_':   u'\xA0',    # non-breaking space
        u'L':   u'\u2028',  # line separator
        u'P':   u'\u2029',  # paragraph separator
    }

    # Numeric escape indicators mapped to the number of hex digits that
    # must follow: \xXX, \uXXXX, \UXXXXXXXX.
    ESCAPE_CODES = {
        u'x':   2,
        u'u':   4,
        u'U':   8,
    }
    def scan_flow_scalar_non_spaces(self, double, start_mark):
        # Scan a run of non-space content inside a quoted scalar,
        # handling '' escaping in single-quoted and \-escapes in
        # double-quoted scalars.  `double` is True for "..." scalars.
        # See the specification for details.
        chunks = []
        while True:
            length = 0
            while self.peek(length) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
                length += 1
            if length:
                chunks.append(self.prefix(length))
                self.forward(length)
            ch = self.peek()
            if not double and ch == u'\'' and self.peek(1) == u'\'':
                # '' inside a single-quoted scalar is an escaped quote.
                chunks.append(u'\'')
                self.forward(2)
            elif (double and ch == u'\'') or (not double and ch in u'\"\\'):
                # A quote/backslash that is not special in this style.
                chunks.append(ch)
                self.forward()
            elif double and ch == u'\\':
                self.forward()
                ch = self.peek()
                if ch in self.ESCAPE_REPLACEMENTS:
                    # Single-character escape, e.g. \n or \t.
                    chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                    self.forward()
                elif ch in self.ESCAPE_CODES:
                    # Numeric escape: \xXX, \uXXXX or \UXXXXXXXX.
                    length = self.ESCAPE_CODES[ch]
                    self.forward()
                    for k in range(length):
                        if self.peek(k) not in u'0123456789ABCDEFabcdef':
                            raise ScannerError("while scanning a double-quoted scalar", start_mark,
                                    "expected escape sequence of %d hexdecimal numbers, but found %r" %
                                        (length, self.peek(k).encode('utf-8')), self.get_mark())
                    code = int(self.prefix(length), 16)
                    if code <= sys.maxunicode:
                        chunks.append(unichr(code))
                    else:
                        # Code point above what unichr() accepts on this
                        # build (narrow Python 2); decode via
                        # unicode-escape to obtain a surrogate pair.
                        chunks.append(('\\U%08x' % code).decode('unicode-escape'))
                    self.forward(length)
                elif ch in u'\r\n\x85\u2028\u2029':
                    # An escaped line break is removed together with any
                    # following blank lines.
                    self.scan_line_break()
                    chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
                else:
                    raise ScannerError("while scanning a double-quoted scalar", start_mark,
                            "found unknown escape character %r" % ch.encode('utf-8'), self.get_mark())
            else:
                # Hit a space, break, quote (in the other sense) or EOF:
                # the non-space run is over.
                return chunks
  1026. def scan_flow_scalar_spaces(self, double, start_mark):
  1027. # See the specification for details.
  1028. chunks = []
  1029. length = 0
  1030. while self.peek(length) in u' \t':
  1031. length += 1
  1032. whitespaces = self.prefix(length)
  1033. self.forward(length)
  1034. ch = self.peek()
  1035. if ch == u'\0':
  1036. raise ScannerError("while scanning a quoted scalar", start_mark,
  1037. "found unexpected end of stream", self.get_mark())
  1038. elif ch in u'\r\n\x85\u2028\u2029':
  1039. line_break = self.scan_line_break()
  1040. breaks = self.scan_flow_scalar_breaks(double, start_mark)
  1041. if line_break != u'\n':
  1042. chunks.append(line_break)
  1043. elif not breaks:
  1044. chunks.append(u' ')
  1045. chunks.extend(breaks)
  1046. else:
  1047. chunks.append(whitespaces)
  1048. return chunks
  1049. def scan_flow_scalar_breaks(self, double, start_mark):
  1050. # See the specification for details.
  1051. chunks = []
  1052. while True:
  1053. # Instead of checking indentation, we check for document
  1054. # separators.
  1055. prefix = self.prefix(3)
  1056. if (prefix == u'---' or prefix == u'...') \
  1057. and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
  1058. raise ScannerError("while scanning a quoted scalar", start_mark,
  1059. "found unexpected document separator", self.get_mark())
  1060. while self.peek() in u' \t':
  1061. self.forward()
  1062. if self.peek() in u'\r\n\x85\u2028\u2029':
  1063. chunks.append(self.scan_line_break())
  1064. else:
  1065. return chunks
    def scan_plain(self):
        # Scan a plain (unquoted) scalar and return a ScalarToken.
        # See the specification for details.
        # We add an additional restriction for the flow context:
        #   plain scalars in the flow context cannot contain ',' or '?'.
        # We also keep track of the `allow_simple_key` flag here.
        # Indentation rules are loosed for the flow context.
        chunks = []
        start_mark = self.get_mark()
        end_mark = start_mark
        indent = self.indent+1
        # We allow zero indentation for scalars, but then we need to check for
        # document separators at the beginning of the line.
        #if indent == 0:
        #    indent = 1
        spaces = []
        while True:
            length = 0
            if self.peek() == u'#':
                # A comment terminates the scalar.
                break
            while True:
                ch = self.peek(length)
                # Stop at whitespace/EOF, at ': ' (a mapping value
                # indicator), or at flow indicators when inside flow
                # context.
                if ch in u'\0 \t\r\n\x85\u2028\u2029' \
                        or (ch == u':' and
                            self.peek(length+1) in u'\0 \t\r\n\x85\u2028\u2029'
                            + (u',[]{}' if self.flow_level else u''))\
                        or (self.flow_level and ch in u',?[]{}'):
                    break
                length += 1
            if length == 0:
                break
            self.allow_simple_key = False
            chunks.extend(spaces)
            chunks.append(self.prefix(length))
            self.forward(length)
            end_mark = self.get_mark()
            spaces = self.scan_plain_spaces(indent, start_mark)
            # Stop when the joining whitespace ended the scalar (None or
            # empty), at a comment, or when a block-context continuation
            # line is insufficiently indented.
            if not spaces or self.peek() == u'#' \
                    or (not self.flow_level and self.column < indent):
                break
        return ScalarToken(u''.join(chunks), True, start_mark, end_mark)
  1106. def scan_plain_spaces(self, indent, start_mark):
  1107. # See the specification for details.
  1108. # The specification is really confusing about tabs in plain scalars.
  1109. # We just forbid them completely. Do not use tabs in YAML!
  1110. chunks = []
  1111. length = 0
  1112. while self.peek(length) in u' ':
  1113. length += 1
  1114. whitespaces = self.prefix(length)
  1115. self.forward(length)
  1116. ch = self.peek()
  1117. if ch in u'\r\n\x85\u2028\u2029':
  1118. line_break = self.scan_line_break()
  1119. self.allow_simple_key = True
  1120. prefix = self.prefix(3)
  1121. if (prefix == u'---' or prefix == u'...') \
  1122. and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
  1123. return
  1124. breaks = []
  1125. while self.peek() in u' \r\n\x85\u2028\u2029':
  1126. if self.peek() == ' ':
  1127. self.forward()
  1128. else:
  1129. breaks.append(self.scan_line_break())
  1130. prefix = self.prefix(3)
  1131. if (prefix == u'---' or prefix == u'...') \
  1132. and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
  1133. return
  1134. if line_break != u'\n':
  1135. chunks.append(line_break)
  1136. elif not breaks:
  1137. chunks.append(u' ')
  1138. chunks.extend(breaks)
  1139. elif whitespaces:
  1140. chunks.append(whitespaces)
  1141. return chunks
  1142. def scan_tag_handle(self, name, start_mark):
  1143. # See the specification for details.
  1144. # For some strange reasons, the specification does not allow '_' in
  1145. # tag handles. I have allowed it anyway.
  1146. ch = self.peek()
  1147. if ch != u'!':
  1148. raise ScannerError("while scanning a %s" % name, start_mark,
  1149. "expected '!', but found %r" % ch.encode('utf-8'),
  1150. self.get_mark())
  1151. length = 1
  1152. ch = self.peek(length)
  1153. if ch != u' ':
  1154. while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \
  1155. or ch in u'-_':
  1156. length += 1
  1157. ch = self.peek(length)
  1158. if ch != u'!':
  1159. self.forward(length)
  1160. raise ScannerError("while scanning a %s" % name, start_mark,
  1161. "expected '!', but found %r" % ch.encode('utf-8'),
  1162. self.get_mark())
  1163. length += 1
  1164. value = self.prefix(length)
  1165. self.forward(length)
  1166. return value
  1167. def scan_tag_uri(self, name, start_mark):
  1168. # See the specification for details.
  1169. # Note: we do not check if URI is well-formed.
  1170. chunks = []
  1171. length = 0
  1172. ch = self.peek(length)
  1173. while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \
  1174. or ch in u'-;/?:@&=+$,_.!~*\'()[]%':
  1175. if ch == u'%':
  1176. chunks.append(self.prefix(length))
  1177. self.forward(length)
  1178. length = 0
  1179. chunks.append(self.scan_uri_escapes(name, start_mark))
  1180. else:
  1181. length += 1
  1182. ch = self.peek(length)
  1183. if length:
  1184. chunks.append(self.prefix(length))
  1185. self.forward(length)
  1186. length = 0
  1187. if not chunks:
  1188. raise ScannerError("while parsing a %s" % name, start_mark,
  1189. "expected URI, but found %r" % ch.encode('utf-8'),
  1190. self.get_mark())
  1191. return u''.join(chunks)
  1192. def scan_uri_escapes(self, name, start_mark):
  1193. # See the specification for details.
  1194. bytes = []
  1195. mark = self.get_mark()
  1196. while self.peek() == u'%':
  1197. self.forward()
  1198. for k in range(2):
  1199. if self.peek(k) not in u'0123456789ABCDEFabcdef':
  1200. raise ScannerError("while scanning a %s" % name, start_mark,
  1201. "expected URI escape sequence of 2 hexdecimal numbers, but found %r" %
  1202. (self.peek(k).encode('utf-8')), self.get_mark())
  1203. bytes.append(chr(int(self.prefix(2), 16)))
  1204. self.forward(2)
  1205. try:
  1206. value = unicode(''.join(bytes), 'utf-8')
  1207. except UnicodeDecodeError, exc:
  1208. raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark)
  1209. return value
  1210. def scan_line_break(self):
  1211. # Transforms:
  1212. # '\r\n' : '\n'
  1213. # '\r' : '\n'
  1214. # '\n' : '\n'
  1215. # '\x85' : '\n'
  1216. # '\u2028' : '\u2028'
  1217. # '\u2029 : '\u2029'
  1218. # default : ''
  1219. ch = self.peek()
  1220. if ch in u'\r\n\x85':
  1221. if self.prefix(2) == u'\r\n':
  1222. self.forward(2)
  1223. else:
  1224. self.forward()
  1225. return u'\n'
  1226. elif ch in u'\u2028\u2029':
  1227. self.forward()
  1228. return ch
  1229. return u''