scanner.py 86 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359
  1. # coding: utf-8
  2. # Scanner produces tokens of the following types:
  3. # STREAM-START
  4. # STREAM-END
  5. # DIRECTIVE(name, value)
  6. # DOCUMENT-START
  7. # DOCUMENT-END
  8. # BLOCK-SEQUENCE-START
  9. # BLOCK-MAPPING-START
  10. # BLOCK-END
  11. # FLOW-SEQUENCE-START
  12. # FLOW-MAPPING-START
  13. # FLOW-SEQUENCE-END
  14. # FLOW-MAPPING-END
  15. # BLOCK-ENTRY
  16. # FLOW-ENTRY
  17. # KEY
  18. # VALUE
  19. # ALIAS(value)
  20. # ANCHOR(value)
  21. # TAG(value)
  22. # SCALAR(value, plain, style)
  23. #
  24. # RoundTripScanner
  25. # COMMENT(value)
  26. #
  27. # Read comments in the Scanner code for more details.
  28. #
  29. import inspect
  30. from ruamel.yaml.error import MarkedYAMLError, CommentMark # NOQA
  31. from ruamel.yaml.tokens import * # NOQA
  32. from ruamel.yaml.compat import check_anchorname_char, nprint, nprintf # NOQA
  33. from typing import Any, Dict, Optional, List, Union, Text # NOQA
  34. from ruamel.yaml.compat import VersionType # NOQA
__all__ = ['Scanner', 'RoundTripScanner', 'ScannerError']

# Characters that end a line/stream: LF, NUL, CR, NEL, LINE SEP, PARA SEP.
_THE_END = '\n\0\r\x85\u2028\u2029'
# Same set plus space and tab (characters that may follow an indicator).
_THE_END_SPACE_TAB = ' \n\0\t\r\x85\u2028\u2029'
_SPACE_TAB = ' \t'
  39. def xprintf(*args: Any, **kw: Any) -> Any:
  40. return nprintf(*args, **kw)
  41. pass
  42. class ScannerError(MarkedYAMLError):
  43. pass
  44. class SimpleKey:
  45. # See below simple keys treatment.
  46. def __init__(
  47. self, token_number: Any, required: Any, index: int, line: int, column: int, mark: Any,
  48. ) -> None:
  49. self.token_number = token_number
  50. self.required = required
  51. self.index = index
  52. self.line = line
  53. self.column = column
  54. self.mark = mark
  55. class Scanner:
    def __init__(self, loader: Any = None) -> None:
        """Initialize the scanner.

        loader: optional owning loader; the scanner registers itself on it.
        """
        # It is assumed that Scanner and Reader will have a common descendant.
        # Reader do the dirty work of checking for BOM and converting the
        # input data to Unicode. It also adds NUL to the end.
        #
        # Reader supports the following methods
        #   self.peek(i=0)    # peek the next i-th character
        #   self.prefix(l=1)  # peek the next l characters
        #   self.forward(l=1) # read the next l characters and move the pointer
        self.loader = loader
        # Register on the loader only if no scanner is attached yet.
        if self.loader is not None and getattr(self.loader, '_scanner', None) is None:
            self.loader._scanner = self
        self.reset_scanner()
        self.first_time = False
        # YAML version (set later, e.g. from a %YAML directive); None until known.
        self.yaml_version: Any = None
  72. @property
  73. def flow_level(self) -> int:
  74. return len(self.flow_context)
    def reset_scanner(self) -> None:
        """Reset all scanning state so a (new) stream can be tokenized."""
        # Had we reached the end of the stream?
        self.done = False
        # flow_context is an expanding/shrinking list consisting of '{' and '['
        # for each unclosed flow context. If empty list that means block context
        self.flow_context: List[Text] = []
        # List of processed tokens that are not yet emitted.
        self.tokens: List[Any] = []
        # Add the STREAM-START token.
        self.fetch_stream_start()
        # Number of tokens that were emitted through the `get_token` method.
        self.tokens_taken = 0
        # The current indentation level.
        self.indent = -1
        # Past indentation levels.
        self.indents: List[int] = []
        # Variables related to simple keys treatment.
        # A simple key is a key that is not denoted by the '?' indicator.
        # Example of simple keys:
        #   ---
        #   block simple key: value
        #   ? not a simple key:
        #   : { flow simple key: value }
        # We emit the KEY token before all keys, so when we find a potential
        # simple key, we try to locate the corresponding ':' indicator.
        # Simple keys should be limited to a single line and 1024 characters.
        # Can a simple key start at the current position? A simple key may
        # start:
        # - at the beginning of the line, not counting indentation spaces
        #   (in block context),
        # - after '{', '[', ',' (in the flow context),
        # - after '?', ':', '-' (in the block context).
        # In the block context, this flag also signifies if a block collection
        # may start at the current position.
        self.allow_simple_key = True
        # Keep track of possible simple keys. This is a dictionary. The key
        # is `flow_level`; there can be no more that one possible simple key
        # for each level. The value is a SimpleKey record:
        #   (token_number, required, index, line, column, mark)
        # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
        # '[', or '{' tokens.
        self.possible_simple_keys: Dict[Any, Any] = {}
  117. @property
  118. def reader(self) -> Any:
  119. try:
  120. return self._scanner_reader # type: ignore
  121. except AttributeError:
  122. if hasattr(self.loader, 'typ'):
  123. self._scanner_reader = self.loader.reader
  124. else:
  125. self._scanner_reader = self.loader._reader
  126. return self._scanner_reader
  127. @property
  128. def scanner_processing_version(self) -> Any: # prefix until un-composited
  129. if hasattr(self.loader, 'typ'):
  130. return self.loader.resolver.processing_version
  131. return self.loader.processing_version
  132. # Public methods.
    def check_token(self, *choices: Any) -> bool:
        """Return True if the next token is an instance of one of *choices*.

        With no arguments, return True if any token is available.
        """
        while self.need_more_tokens():
            self.fetch_more_tokens()
        if len(self.tokens) > 0:
            if not choices:
                return True
            for choice in choices:
                if isinstance(self.tokens[0], choice):
                    return True
        return False
  144. def peek_token(self) -> Any:
  145. # Return the next token, but do not delete if from the queue.
  146. while self.need_more_tokens():
  147. self.fetch_more_tokens()
  148. if len(self.tokens) > 0:
  149. return self.tokens[0]
  150. def get_token(self) -> Any:
  151. # Return the next token.
  152. while self.need_more_tokens():
  153. self.fetch_more_tokens()
  154. if len(self.tokens) > 0:
  155. self.tokens_taken += 1
  156. return self.tokens.pop(0)
  157. # Private methods.
    def need_more_tokens(self) -> bool:
        """Return True when more tokens must be fetched before one can be emitted."""
        if self.done:
            return False
        if len(self.tokens) == 0:
            return True
        # The current token may be a potential simple key, so we
        # need to look further.
        self.stale_possible_simple_keys()
        if self.next_possible_simple_key() == self.tokens_taken:
            return True
        return False
    def fetch_comment(self, comment: Any) -> None:
        """Handle a scanned comment; only the RoundTripScanner implements this."""
        raise NotImplementedError
    def fetch_more_tokens(self) -> Any:
        """Dispatch on the next significant character and fetch the matching token(s).

        Raises ScannerError if the character cannot start any token.
        """
        # Eat whitespaces and comments until we reach the next token.
        comment = self.scan_to_next_token()
        if comment is not None:  # never happens for base scanner
            return self.fetch_comment(comment)
        # Remove obsolete possible simple keys.
        self.stale_possible_simple_keys()
        # Compare the current indentation and column. It may add some tokens
        # and decrease the current indentation level.
        self.unwind_indent(self.reader.column)
        # Peek the next character.
        ch = self.reader.peek()
        # Is it the end of stream?
        if ch == '\0':
            return self.fetch_stream_end()
        # Is it a directive?
        if ch == '%' and self.check_directive():
            return self.fetch_directive()
        # Is it the document start?
        if ch == '-' and self.check_document_start():
            return self.fetch_document_start()
        # Is it the document end?
        if ch == '.' and self.check_document_end():
            return self.fetch_document_end()
        # TODO: support for BOM within a stream.
        # if ch == '\uFEFF':
        #     return self.fetch_bom()      <-- issue BOMToken
        # Note: the order of the following checks is NOT significant.
        # Is it the flow sequence start indicator?
        if ch == '[':
            return self.fetch_flow_sequence_start()
        # Is it the flow mapping start indicator?
        if ch == '{':
            return self.fetch_flow_mapping_start()
        # Is it the flow sequence end indicator?
        if ch == ']':
            return self.fetch_flow_sequence_end()
        # Is it the flow mapping end indicator?
        if ch == '}':
            return self.fetch_flow_mapping_end()
        # Is it the flow entry indicator?
        if ch == ',':
            return self.fetch_flow_entry()
        # Is it the block entry indicator?
        if ch == '-' and self.check_block_entry():
            return self.fetch_block_entry()
        # Is it the key indicator?
        if ch == '?' and self.check_key():
            return self.fetch_key()
        # Is it the value indicator?
        if ch == ':' and self.check_value():
            return self.fetch_value()
        # Is it an alias?
        if ch == '*':
            return self.fetch_alias()
        # Is it an anchor?
        if ch == '&':
            return self.fetch_anchor()
        # Is it a tag?
        if ch == '!':
            return self.fetch_tag()
        # Is it a literal scalar?
        if ch == '|' and not self.flow_level:
            return self.fetch_literal()
        # Is it a folded scalar?
        if ch == '>' and not self.flow_level:
            return self.fetch_folded()
        # Is it a single quoted scalar?
        if ch == "'":
            return self.fetch_single()
        # Is it a double quoted scalar?
        if ch == '"':
            return self.fetch_double()
        # It must be a plain scalar then.
        if self.check_plain():
            return self.fetch_plain()
        # No? It's an error. Let's produce a nice error message.
        raise ScannerError(
            'while scanning for the next token',
            None,
            f'found character {ch!r} that cannot start any token',
            self.reader.get_mark(),
        )
  254. # Simple keys treatment.
  255. def next_possible_simple_key(self) -> Any:
  256. # Return the number of the nearest possible simple key. Actually we
  257. # don't need to loop through the whole dictionary. We may replace it
  258. # with the following code:
  259. # if not self.possible_simple_keys:
  260. # return None
  261. # return self.possible_simple_keys[
  262. # min(self.possible_simple_keys.keys())].token_number
  263. min_token_number = None
  264. for level in self.possible_simple_keys:
  265. key = self.possible_simple_keys[level]
  266. if min_token_number is None or key.token_number < min_token_number:
  267. min_token_number = key.token_number
  268. return min_token_number
    def stale_possible_simple_keys(self) -> None:
        """Drop pending simple keys that can no longer be valid.

        Raises ScannerError if a *required* key is dropped.
        """
        # Remove entries that are no longer possible simple keys. According to
        # the YAML specification, simple keys
        # - should be limited to a single line,
        # - should be no longer than 1024 characters.
        # Disabling this procedure will allow simple keys of any length and
        # height (may cause problems if indentation is broken though).
        for level in list(self.possible_simple_keys):
            key = self.possible_simple_keys[level]
            if key.line != self.reader.line or self.reader.index - key.index > 1024:
                if key.required:
                    raise ScannerError(
                        'while scanning a simple key',
                        key.mark,
                        "could not find expected ':'",
                        self.reader.get_mark(),
                    )
                del self.possible_simple_keys[level]
    def save_possible_simple_key(self) -> None:
        """Record the current position as a possible simple-key start."""
        # The next token may start a simple key. We check if it's possible
        # and save its position. This function is called for
        #   ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
        # Check if a simple key is required at the current position.
        required = not self.flow_level and self.indent == self.reader.column
        # The next token might be a simple key. Let's save it's number and
        # position.
        if self.allow_simple_key:
            self.remove_possible_simple_key()
            token_number = self.tokens_taken + len(self.tokens)
            key = SimpleKey(
                token_number,
                required,
                self.reader.index,
                self.reader.line,
                self.reader.column,
                self.reader.get_mark(),
            )
            self.possible_simple_keys[self.flow_level] = key

    def remove_possible_simple_key(self) -> None:
        """Discard the pending simple key at the current flow level, if any.

        Raises ScannerError if that key was required.
        """
        # Remove the saved possible key position at the current flow level.
        if self.flow_level in self.possible_simple_keys:
            key = self.possible_simple_keys[self.flow_level]
            if key.required:
                raise ScannerError(
                    'while scanning a simple key',
                    key.mark,
                    "could not find expected ':'",
                    self.reader.get_mark(),
                )
            del self.possible_simple_keys[self.flow_level]
  319. # Indentation functions.
    def unwind_indent(self, column: Any) -> None:
        """Pop indentation levels above *column*, emitting BLOCK-END tokens."""
        # In flow context, tokens should respect indentation.
        # Actually the condition should be `self.indent >= column` according to
        # the spec. But this condition will prohibit intuitively correct
        # constructions such as
        #   key : {
        #   }
        # ####
        # if self.flow_level and self.indent > column:
        #     raise ScannerError(None, None,
        #             "invalid intendation or unclosed '[' or '{'",
        #             self.reader.get_mark())
        # In the flow context, indentation is ignored. We make the scanner less
        # restrictive then specification requires.
        if bool(self.flow_level):
            return
        # In block context, we may need to issue the BLOCK-END tokens.
        while self.indent > column:
            mark = self.reader.get_mark()
            self.indent = self.indents.pop()
            self.tokens.append(BlockEndToken(mark, mark))
  341. def add_indent(self, column: int) -> bool:
  342. # Check if we need to increase indentation.
  343. if self.indent < column:
  344. self.indents.append(self.indent)
  345. self.indent = column
  346. return True
  347. return False
  348. # Fetchers.
    def fetch_stream_start(self) -> None:
        """Emit the STREAM-START token (always the first token produced)."""
        # We always add STREAM-START as the first token and STREAM-END as the
        # last token.
        # Read the token.
        mark = self.reader.get_mark()
        # Add STREAM-START.
        self.tokens.append(StreamStartToken(mark, mark, encoding=self.reader.encoding))

    def fetch_stream_end(self) -> None:
        """Emit the STREAM-END token and mark scanning as finished."""
        # Set the current intendation to -1.
        self.unwind_indent(-1)
        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False
        self.possible_simple_keys = {}
        # Read the token.
        mark = self.reader.get_mark()
        # Add STREAM-END.
        self.tokens.append(StreamEndToken(mark, mark))
        # The steam is finished.
        self.done = True
    def fetch_directive(self) -> None:
        """Handle '%' at column 0: scan and emit a DIRECTIVE token."""
        # Set the current intendation to -1.
        self.unwind_indent(-1)
        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False
        # Scan and add DIRECTIVE.
        self.tokens.append(self.scan_directive())
    def fetch_document_start(self) -> None:
        """Handle '---': emit DOCUMENT-START."""
        self.fetch_document_indicator(DocumentStartToken)

    def fetch_document_end(self) -> None:
        """Handle '...': emit DOCUMENT-END."""
        self.fetch_document_indicator(DocumentEndToken)

    def fetch_document_indicator(self, TokenClass: Any) -> None:
        """Common code for DOCUMENT-START and DOCUMENT-END."""
        # Set the current intendation to -1.
        self.unwind_indent(-1)
        # Reset simple keys. Note that there could not be a block collection
        # after '---'.
        self.remove_possible_simple_key()
        self.allow_simple_key = False
        # Add DOCUMENT-START or DOCUMENT-END.
        start_mark = self.reader.get_mark()
        self.reader.forward(3)  # consume the three indicator characters
        end_mark = self.reader.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
    def fetch_flow_sequence_start(self) -> None:
        """Handle '[': emit FLOW-SEQUENCE-START."""
        self.fetch_flow_collection_start(FlowSequenceStartToken, to_push='[')

    def fetch_flow_mapping_start(self) -> None:
        """Handle '{': emit FLOW-MAPPING-START."""
        self.fetch_flow_collection_start(FlowMappingStartToken, to_push='{')

    def fetch_flow_collection_start(self, TokenClass: Any, to_push: Text) -> None:
        """Common code for '[' and '{'; *to_push* is recorded on flow_context."""
        # '[' and '{' may start a simple key.
        self.save_possible_simple_key()
        # Increase the flow level.
        self.flow_context.append(to_push)
        # Simple keys are allowed after '[' and '{'.
        self.allow_simple_key = True
        # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
    def fetch_flow_sequence_end(self) -> None:
        """Handle ']': emit FLOW-SEQUENCE-END."""
        self.fetch_flow_collection_end(FlowSequenceEndToken)

    def fetch_flow_mapping_end(self) -> None:
        """Handle '}': emit FLOW-MAPPING-END."""
        self.fetch_flow_collection_end(FlowMappingEndToken)

    def fetch_flow_collection_end(self, TokenClass: Any) -> None:
        """Common code for ']' and '}'."""
        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()
        # Decrease the flow level.
        try:
            popped = self.flow_context.pop()  # NOQA
        except IndexError:
            # We must not be in a list or object.
            # Defer error handling to the parser.
            pass
        # No simple keys after ']' or '}'.
        self.allow_simple_key = False
        # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
    def fetch_flow_entry(self) -> None:
        """Handle ',' in flow context: emit FLOW-ENTRY."""
        # Simple keys are allowed after ','.
        self.allow_simple_key = True
        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()
        # Add FLOW-ENTRY.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(FlowEntryToken(start_mark, end_mark))
    def fetch_block_entry(self) -> None:
        """Handle '-' entry indicator: emit BLOCK-ENTRY (and possibly BLOCK-SEQUENCE-START).

        Raises ScannerError when a sequence entry is not allowed here.
        """
        # Block context needs additional checks.
        if not self.flow_level:
            # Are we allowed to start a new entry?
            if not self.allow_simple_key:
                raise ScannerError(
                    None,
                    None,
                    'sequence entries are not allowed here',
                    self.reader.get_mark(),
                )
            # We may need to add BLOCK-SEQUENCE-START.
            if self.add_indent(self.reader.column):
                mark = self.reader.get_mark()
                self.tokens.append(BlockSequenceStartToken(mark, mark))
        # It's an error for the block entry to occur in the flow context,
        # but we let the parser detect this.
        else:
            pass
        # Simple keys are allowed after '-'.
        self.allow_simple_key = True
        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()
        # Add BLOCK-ENTRY.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(BlockEntryToken(start_mark, end_mark))
    def fetch_key(self) -> None:
        """Handle '?' key indicator: emit KEY (and possibly BLOCK-MAPPING-START).

        Raises ScannerError when a mapping key is not allowed here.
        """
        # Block context needs additional checks.
        if not self.flow_level:
            # Are we allowed to start a key (not nessesary a simple)?
            if not self.allow_simple_key:
                raise ScannerError(
                    None, None, 'mapping keys are not allowed here', self.reader.get_mark(),
                )
            # We may need to add BLOCK-MAPPING-START.
            if self.add_indent(self.reader.column):
                mark = self.reader.get_mark()
                self.tokens.append(BlockMappingStartToken(mark, mark))
        # Simple keys are allowed after '?' in the block context.
        self.allow_simple_key = not self.flow_level
        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()
        # Add KEY.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(KeyToken(start_mark, end_mark))
    def fetch_value(self) -> None:
        """Handle ':' value indicator: emit VALUE, retro-inserting KEY and
        BLOCK-MAPPING-START when a pending simple key is confirmed.

        Raises ScannerError when a mapping value is not allowed here.
        """
        # Do we determine a simple key?
        if self.flow_level in self.possible_simple_keys:
            # Add KEY.
            key = self.possible_simple_keys[self.flow_level]
            del self.possible_simple_keys[self.flow_level]
            # Insert at the key's recorded position, not at the end.
            self.tokens.insert(
                key.token_number - self.tokens_taken, KeyToken(key.mark, key.mark),
            )
            # If this key starts a new block mapping, we need to add
            # BLOCK-MAPPING-START.
            if not self.flow_level:
                if self.add_indent(key.column):
                    self.tokens.insert(
                        key.token_number - self.tokens_taken,
                        BlockMappingStartToken(key.mark, key.mark),
                    )
            # There cannot be two simple keys one after another.
            self.allow_simple_key = False
        # It must be a part of a complex key.
        else:
            # Block context needs additional checks.
            # (Do we really need them? They will be caught by the parser
            # anyway.)
            if not self.flow_level:
                # We are allowed to start a complex value if and only if
                # we can start a simple key.
                if not self.allow_simple_key:
                    raise ScannerError(
                        None,
                        None,
                        'mapping values are not allowed here',
                        self.reader.get_mark(),
                    )
            # If this value starts a new block mapping, we need to add
            # BLOCK-MAPPING-START. It will be detected as an error later by
            # the parser.
            if not self.flow_level:
                if self.add_indent(self.reader.column):
                    mark = self.reader.get_mark()
                    self.tokens.append(BlockMappingStartToken(mark, mark))
        # Simple keys are allowed after ':' in the block context.
        self.allow_simple_key = not self.flow_level
        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()
        # Add VALUE.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(ValueToken(start_mark, end_mark))
    def fetch_alias(self) -> None:
        """Handle '*': scan and emit an ALIAS token."""
        # ALIAS could be a simple key.
        self.save_possible_simple_key()
        # No simple keys after ALIAS.
        self.allow_simple_key = False
        # Scan and add ALIAS.
        self.tokens.append(self.scan_anchor(AliasToken))

    def fetch_anchor(self) -> None:
        """Handle '&': scan and emit an ANCHOR token."""
        # ANCHOR could start a simple key.
        self.save_possible_simple_key()
        # No simple keys after ANCHOR.
        self.allow_simple_key = False
        # Scan and add ANCHOR.
        self.tokens.append(self.scan_anchor(AnchorToken))

    def fetch_tag(self) -> None:
        """Handle '!': scan and emit a TAG token."""
        # TAG could start a simple key.
        self.save_possible_simple_key()
        # No simple keys after TAG.
        self.allow_simple_key = False
        # Scan and add TAG.
        self.tokens.append(self.scan_tag())
    def fetch_literal(self) -> None:
        """Handle '|': literal block scalar."""
        self.fetch_block_scalar(style='|')

    def fetch_folded(self) -> None:
        """Handle '>': folded block scalar."""
        self.fetch_block_scalar(style='>')

    def fetch_block_scalar(self, style: Any) -> None:
        """Common code for literal ('|') and folded ('>') block scalars."""
        # A simple key may follow a block scalar.
        self.allow_simple_key = True
        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()
        # Scan and add SCALAR.
        self.tokens.append(self.scan_block_scalar(style))
    def fetch_single(self) -> None:
        """Handle a single-quoted scalar."""
        self.fetch_flow_scalar(style="'")

    def fetch_double(self) -> None:
        """Handle a double-quoted scalar."""
        self.fetch_flow_scalar(style='"')

    def fetch_flow_scalar(self, style: Any) -> None:
        """Common code for single- and double-quoted scalars."""
        # A flow scalar could be a simple key.
        self.save_possible_simple_key()
        # No simple keys after flow scalars.
        self.allow_simple_key = False
        # Scan and add SCALAR.
        self.tokens.append(self.scan_flow_scalar(style))
    def fetch_plain(self) -> None:
        """Scan and emit a plain (unquoted) SCALAR token."""
        # A plain scalar could be a simple key.
        self.save_possible_simple_key()
        # No simple keys after plain scalars. But note that `scan_plain` will
        # change this flag if the scan is finished at the beginning of the
        # line.
        self.allow_simple_key = False
        # Scan and add SCALAR. May change `allow_simple_key`.
        self.tokens.append(self.scan_plain())
  591. # Checkers.
  592. def check_directive(self) -> Any:
  593. # DIRECTIVE: ^ '%' ...
  594. # The '%' indicator is already checked.
  595. if self.reader.column == 0:
  596. return True
  597. return None
  598. def check_document_start(self) -> Any:
  599. # DOCUMENT-START: ^ '---' (' '|'\n')
  600. if self.reader.column == 0:
  601. if self.reader.prefix(3) == '---' and self.reader.peek(3) in _THE_END_SPACE_TAB:
  602. return True
  603. return None
  604. def check_document_end(self) -> Any:
  605. # DOCUMENT-END: ^ '...' (' '|'\n')
  606. if self.reader.column == 0:
  607. if self.reader.prefix(3) == '...' and self.reader.peek(3) in _THE_END_SPACE_TAB:
  608. return True
  609. return None
  610. def check_block_entry(self) -> Any:
  611. # BLOCK-ENTRY: '-' (' '|'\n')
  612. return self.reader.peek(1) in _THE_END_SPACE_TAB
  613. def check_key(self) -> Any:
  614. # KEY(flow context): '?'
  615. if bool(self.flow_level):
  616. return True
  617. # KEY(block context): '?' (' '|'\n')
  618. return self.reader.peek(1) in _THE_END_SPACE_TAB
    def check_value(self) -> Any:
        # Decide whether the current ':' starts a VALUE token.
        # VALUE(flow context): ':'
        if self.scanner_processing_version == (1, 1):
            # YAML 1.1: in flow context a ':' always starts a value.
            if bool(self.flow_level):
                return True
        else:
            # YAML 1.2: in flow context a ':' must be followed by
            # space/end-of-line when inside a flow sequence, or when the
            # previous token was already a VALUE.
            if bool(self.flow_level):
                if self.flow_context[-1] == '[':
                    if self.reader.peek(1) not in _THE_END_SPACE_TAB:
                        return False
                elif self.tokens and isinstance(self.tokens[-1], ValueToken):
                    # mapping flow context scanning a value token
                    if self.reader.peek(1) not in _THE_END_SPACE_TAB:
                        return False
                return True
        # VALUE(block context): ':' (' '|'\n')
        return self.reader.peek(1) in _THE_END_SPACE_TAB
    def check_plain(self) -> Any:
        # Decide whether the current character may start a plain scalar.
        #
        # A plain scalar may start with any non-space character except:
        # '-', '?', ':', ',', '[', ']', '{', '}',
        # '#', '&', '*', '!', '|', '>', '\'', '\"',
        # '%', '@', '`'.
        #
        # It may also start with
        # '-', '?', ':'
        # if it is followed by a non-space character.
        #
        # Note that we limit the last rule to the block context (except the
        # '-' character) because we want the flow context to be space
        # independent.
        srp = self.reader.peek
        ch = srp()
        if self.scanner_processing_version == (1, 1):
            return ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'"%@`' or (
                srp(1) not in _THE_END_SPACE_TAB
                and (ch == '-' or (not self.flow_level and ch in '?:'))
            )
        # YAML 1.2
        if ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'"%@`':
            # ################### ^ ???
            return True
        ch1 = srp(1)
        # '-' starts a plain scalar when not followed by space/line end.
        if ch == '-' and ch1 not in _THE_END_SPACE_TAB:
            return True
        # In flow context, ':' not followed by space/tab can start a scalar.
        if ch == ':' and bool(self.flow_level) and ch1 not in _SPACE_TAB:
            return True
        return srp(1) not in _THE_END_SPACE_TAB and (
            ch == '-' or (not self.flow_level and ch in '?:')
        )
  668. # Scanners.
    def scan_to_next_token(self) -> Any:
        # We ignore spaces, line breaks and comments.
        # If we find a line break in the block context, we set the flag
        # `allow_simple_key` on.
        # The byte order mark is stripped if it's the first character in the
        # stream. We do not yet support BOM inside the stream as the
        # specification requires. Any such mark will be considered as a part
        # of the document.
        #
        # TODO: We need to make tab handling rules more sane. A good rule is
        # Tabs cannot precede tokens
        # BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
        # KEY(block), VALUE(block), BLOCK-ENTRY
        # So the checking code is
        # if <TAB>:
        #     self.allow_simple_keys = False
        # We also need to add the check for `allow_simple_keys == True` to
        # `unwind_indent` before issuing BLOCK-END.
        # Scanners for block, flow, and plain scalars need to be modified.
        srp = self.reader.peek
        srf = self.reader.forward
        # Strip a leading BOM only at the very start of the stream.
        if self.reader.index == 0 and srp() == '\uFEFF':
            srf()
        found = False
        _the_end = _THE_END
        # Tabs are only skippable whitespace inside flow collections.
        white_space = ' \t' if self.flow_level > 0 else ' '
        while not found:
            while srp() in white_space:
                srf()
            if srp() == '#':
                # Comments run to the end of the line.
                while srp() not in _the_end:
                    srf()
            if self.scan_line_break():
                # A line break in block context re-enables simple keys.
                if not self.flow_level:
                    self.allow_simple_key = True
            else:
                found = True
        return None
    def scan_directive(self) -> Any:
        # Scan a '%...' directive line and return a DirectiveToken.
        # See the specification for details.
        srp = self.reader.peek
        srf = self.reader.forward
        start_mark = self.reader.get_mark()
        # Skip the '%' indicator.
        srf()
        name = self.scan_directive_name(start_mark)
        value = None
        if name == 'YAML':
            value = self.scan_yaml_directive_value(start_mark)
            end_mark = self.reader.get_mark()
        elif name == 'TAG':
            value = self.scan_tag_directive_value(start_mark)
            end_mark = self.reader.get_mark()
        else:
            # Unknown directive: skip the rest of the line; value stays None.
            end_mark = self.reader.get_mark()
            while srp() not in _THE_END:
                srf()
        self.scan_directive_ignored_line(start_mark)
        return DirectiveToken(name, value, start_mark, end_mark)
  727. def scan_directive_name(self, start_mark: Any) -> Any:
  728. # See the specification for details.
  729. length = 0
  730. srp = self.reader.peek
  731. ch = srp(length)
  732. while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' or ch in '-_:.':
  733. length += 1
  734. ch = srp(length)
  735. if not length:
  736. raise ScannerError(
  737. 'while scanning a directive',
  738. start_mark,
  739. f'expected alphabetic or numeric character, but found {ch!r}',
  740. self.reader.get_mark(),
  741. )
  742. value = self.reader.prefix(length)
  743. self.reader.forward(length)
  744. ch = srp()
  745. if ch not in '\0 \r\n\x85\u2028\u2029':
  746. raise ScannerError(
  747. 'while scanning a directive',
  748. start_mark,
  749. f'expected alphabetic or numeric character, but found {ch!r}',
  750. self.reader.get_mark(),
  751. )
  752. return value
    def scan_yaml_directive_value(self, start_mark: Any) -> Any:
        # Scan 'major.minor' after '%YAML', store and return the version pair.
        # See the specification for details.
        srp = self.reader.peek
        srf = self.reader.forward
        while srp() == ' ':
            srf()
        major = self.scan_yaml_directive_number(start_mark)
        if srp() != '.':
            raise ScannerError(
                'while scanning a directive',
                start_mark,
                f"expected a digit or '.', but found {srp()!r}",
                self.reader.get_mark(),
            )
        # Skip the '.' separator.
        srf()
        minor = self.scan_yaml_directive_number(start_mark)
        # The version must be followed by a space or line end.
        if srp() not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError(
                'while scanning a directive',
                start_mark,
                f"expected a digit or '.', but found {srp()!r}",
                self.reader.get_mark(),
            )
        self.yaml_version = (major, minor)
        return self.yaml_version
  778. def scan_yaml_directive_number(self, start_mark: Any) -> Any:
  779. # See the specification for details.
  780. srp = self.reader.peek
  781. srf = self.reader.forward
  782. ch = srp()
  783. if not ('0' <= ch <= '9'):
  784. raise ScannerError(
  785. 'while scanning a directive',
  786. start_mark,
  787. f'expected a digit, but found {ch!r}',
  788. self.reader.get_mark(),
  789. )
  790. length = 0
  791. while '0' <= srp(length) <= '9':
  792. length += 1
  793. value = int(self.reader.prefix(length))
  794. srf(length)
  795. return value
  796. def scan_tag_directive_value(self, start_mark: Any) -> Any:
  797. # See the specification for details.
  798. srp = self.reader.peek
  799. srf = self.reader.forward
  800. while srp() == ' ':
  801. srf()
  802. handle = self.scan_tag_directive_handle(start_mark)
  803. while srp() == ' ':
  804. srf()
  805. prefix = self.scan_tag_directive_prefix(start_mark)
  806. return (handle, prefix)
  807. def scan_tag_directive_handle(self, start_mark: Any) -> Any:
  808. # See the specification for details.
  809. value = self.scan_tag_handle('directive', start_mark)
  810. ch = self.reader.peek()
  811. if ch != ' ':
  812. raise ScannerError(
  813. 'while scanning a directive',
  814. start_mark,
  815. f"expected ' ', but found {ch!r}",
  816. self.reader.get_mark(),
  817. )
  818. return value
  819. def scan_tag_directive_prefix(self, start_mark: Any) -> Any:
  820. # See the specification for details.
  821. value = self.scan_tag_uri('directive', start_mark)
  822. ch = self.reader.peek()
  823. if ch not in '\0 \r\n\x85\u2028\u2029':
  824. raise ScannerError(
  825. 'while scanning a directive',
  826. start_mark,
  827. f"expected ' ', but found {ch!r}",
  828. self.reader.get_mark(),
  829. )
  830. return value
    def scan_directive_ignored_line(self, start_mark: Any) -> None:
        # Consume trailing blanks, an optional comment, and the line break
        # that ends a directive line.
        # See the specification for details.
        srp = self.reader.peek
        srf = self.reader.forward
        while srp() == ' ':
            srf()
        if srp() == '#':
            # A comment runs until the end of the line.
            while srp() not in _THE_END:
                srf()
        ch = srp()
        if ch not in _THE_END:
            raise ScannerError(
                'while scanning a directive',
                start_mark,
                f'expected a comment or a line break, but found {ch!r}',
                self.reader.get_mark(),
            )
        self.scan_line_break()
    def scan_anchor(self, TokenClass: Any) -> Any:
        # Scan '&name' or '*name' and return TokenClass(name, ...).
        #
        # The specification does not restrict characters for anchors and
        # aliases. This may lead to problems, for instance, the document:
        #   [ *alias, value ]
        # can be interpteted in two ways, as
        #   [ "value" ]
        # and
        #   [ *alias , "value" ]
        # Therefore we restrict aliases to numbers and ASCII letters.
        srp = self.reader.peek
        start_mark = self.reader.get_mark()
        indicator = srp()
        # '*' introduces an alias, '&' an anchor; only the error text differs.
        if indicator == '*':
            name = 'alias'
        else:
            name = 'anchor'
        self.reader.forward()
        length = 0
        ch = srp(length)
        # while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
        #         or ch in '-_':
        while check_anchorname_char(ch):
            length += 1
            ch = srp(length)
        if not length:
            raise ScannerError(
                f'while scanning an {name!s}',
                start_mark,
                f'expected alphabetic or numeric character, but found {ch!r}',
                self.reader.get_mark(),
            )
        value = self.reader.prefix(length)
        self.reader.forward(length)
        # ch1 = ch
        # ch = srp()  # no need to peek, ch is already set
        # assert ch1 == ch
        # The name must be terminated by whitespace or a flow indicator.
        if ch not in '\0 \t\r\n\x85\u2028\u2029?:,[]{}%@`':
            raise ScannerError(
                f'while scanning an {name!s}',
                start_mark,
                f'expected alphabetic or numeric character, but found {ch!r}',
                self.reader.get_mark(),
            )
        end_mark = self.reader.get_mark()
        return TokenClass(value, start_mark, end_mark)
    def scan_tag(self) -> Any:
        # Scan a tag token: verbatim '!<uri>', handled '!handle!suffix',
        # or a bare '!' / '!!'.  Returns a TagToken with (handle, suffix).
        # See the specification for details.
        srp = self.reader.peek
        start_mark = self.reader.get_mark()
        ch = srp(1)
        short_handle = '!'
        if ch == '!':
            # A second '!' makes the short handle '!!'; consume the first one.
            short_handle = '!!'
            self.reader.forward()
            srp = self.reader.peek
            ch = srp(1)
        if ch == '<':
            # Verbatim tag: '!<' uri '>'.
            handle = None
            self.reader.forward(2)
            suffix = self.scan_tag_uri('tag', start_mark)
            if srp() != '>':
                raise ScannerError(
                    'while parsing a tag',
                    start_mark,
                    f"expected '>' but found {srp()!r}",
                    self.reader.get_mark(),
                )
            self.reader.forward()
        elif ch in _THE_END_SPACE_TAB:
            # Bare '!' (or '!!'): the non-specific tag.
            handle = None
            suffix = short_handle
            self.reader.forward()
        else:
            # Look ahead for a second '!' to decide between a named handle
            # ('!handle!suffix') and a short-handle suffix ('!suffix').
            length = 1
            use_handle = False
            while ch not in '\0 \r\n\x85\u2028\u2029':
                if ch == '!':
                    use_handle = True
                    break
                length += 1
                ch = srp(length)
            handle = short_handle
            if use_handle:
                handle = self.scan_tag_handle('tag', start_mark)
            else:
                handle = short_handle
                self.reader.forward()
            suffix = self.scan_tag_uri('tag', start_mark)
        ch = srp()
        # A tag must be terminated by a space or line end.
        if ch not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError(
                'while scanning a tag',
                start_mark,
                f"expected ' ', but found {ch!r}",
                self.reader.get_mark(),
            )
        value = (handle, suffix)
        end_mark = self.reader.get_mark()
        return TagToken(value, start_mark, end_mark)
    def scan_block_scalar(self, style: Any, rt: Optional[bool] = False) -> Any:
        # Scan a literal ('|') or folded ('>') block scalar and return a
        # ScalarToken.  `rt` marks round-trip mode: folded newlines are then
        # tagged with '\a' so they can be restored on output.
        # See the specification for details.
        srp = self.reader.peek
        if style == '>':
            folded = True
        else:
            folded = False
        chunks: List[Any] = []
        start_mark = self.reader.get_mark()
        # Scan the header.
        self.reader.forward()
        chomping, increment = self.scan_block_scalar_indicators(start_mark)
        # block scalar comment e.g. : |+ # comment text
        block_scalar_comment = self.scan_block_scalar_ignored_line(start_mark)
        # Determine the indentation level and go to the first non-empty line.
        min_indent = self.indent + 1
        if increment is None:
            # no increment and top level, min_indent could be 0
            if min_indent < 1 and (
                style not in '|>'
                or (self.scanner_processing_version == (1, 1))
                and getattr(
                    self.loader, 'top_level_block_style_scalar_no_indent_error_1_1', False,
                )
            ):
                min_indent = 1
            breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
            indent = max(min_indent, max_indent)
        else:
            # Explicit indentation indicator: indent is relative to parent.
            if min_indent < 1:
                min_indent = 1
            indent = min_indent + increment - 1
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
        line_break = ""
        # Scan the inner part of the block scalar.
        while self.reader.column == indent and srp() != '\0':
            chunks.extend(breaks)
            leading_non_space = srp() not in ' \t'
            length = 0
            while srp(length) not in _THE_END:
                length += 1
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
            line_break = self.scan_line_break()
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
            if style in '|>' and min_indent == 0:
                # at the beginning of a line, if in block style see if
                # end of document/start_new_document
                if self.check_document_start() or self.check_document_end():
                    break
            if self.reader.column == indent and srp() != '\0':
                # Unfortunately, folding rules are ambiguous.
                #
                # This is the folding according to the specification:
                if rt and folded and line_break == '\n':
                    chunks.append('\a')
                if folded and line_break == '\n' and leading_non_space and srp() not in ' \t':
                    if not breaks:
                        chunks.append(' ')
                else:
                    chunks.append(line_break)
                # This is Clark Evans's interpretation (also in the spec
                # examples):
                #
                # if folded and line_break == '\n':
                #     if not breaks:
                #         if srp() not in ' \t':
                #             chunks.append(' ')
                #         else:
                #             chunks.append(line_break)
                # else:
                #     chunks.append(line_break)
            else:
                break
        # Process trailing line breaks. The 'chomping' setting determines
        # whether they are included in the value.
        trailing: List[Any] = []
        if chomping in [None, True]:
            chunks.append(line_break)
        if chomping is True:
            chunks.extend(breaks)
        elif chomping in [None, False]:
            trailing.extend(breaks)
        # We are done.
        token = ScalarToken("".join(chunks), False, start_mark, end_mark, style)
        if self.loader is not None:
            comment_handler = getattr(self.loader, 'comment_handling', False)
            if comment_handler is None:
                if block_scalar_comment is not None:
                    token.add_pre_comments([block_scalar_comment])
        if len(trailing) > 0:
            # Eat whitespaces and comments until we reach the next token.
            if self.loader is not None:
                comment_handler = getattr(self.loader, 'comment_handling', None)
                if comment_handler is not None:
                    line = end_mark.line - len(trailing)
                    for x in trailing:
                        assert x[-1] == '\n'
                        self.comments.add_blank_line(x, 0, line)  # type: ignore
                        line += 1
            comment = self.scan_to_next_token()
            while comment:
                trailing.append(' ' * comment[1].column + comment[0])
                comment = self.scan_to_next_token()
            if self.loader is not None:
                comment_handler = getattr(self.loader, 'comment_handling', False)
                if comment_handler is None:
                    # Keep track of the trailing whitespace and following comments
                    # as a comment token, if isn't all included in the actual value.
                    comment_end_mark = self.reader.get_mark()
                    comment = CommentToken("".join(trailing), end_mark, comment_end_mark)
                    token.add_post_comment(comment)
        return token
    def scan_block_scalar_indicators(self, start_mark: Any) -> Any:
        # Scan the optional chomping ('+'/'-') and indentation ('1'-'9')
        # indicators after '|' or '>'; they may appear in either order.
        # Returns (chomping, increment): chomping is True ('+'), False ('-')
        # or None (clip); increment is an int or None.
        # See the specification for details.
        srp = self.reader.peek
        chomping = None
        increment = None
        ch = srp()
        if ch in '+-':
            # Chomping indicator first.
            if ch == '+':
                chomping = True
            else:
                chomping = False
            self.reader.forward()
            ch = srp()
            if ch in '0123456789':
                increment = int(ch)
                if increment == 0:
                    raise ScannerError(
                        'while scanning a block scalar',
                        start_mark,
                        'expected indentation indicator in the range 1-9, ' 'but found 0',
                        self.reader.get_mark(),
                    )
                self.reader.forward()
        elif ch in '0123456789':
            # Indentation indicator first.
            increment = int(ch)
            if increment == 0:
                raise ScannerError(
                    'while scanning a block scalar',
                    start_mark,
                    'expected indentation indicator in the range 1-9, ' 'but found 0',
                    self.reader.get_mark(),
                )
            self.reader.forward()
            ch = srp()
            if ch in '+-':
                if ch == '+':
                    chomping = True
                else:
                    chomping = False
                self.reader.forward()
        ch = srp()
        # Indicators must be followed by a space or line end.
        if ch not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError(
                'while scanning a block scalar',
                start_mark,
                f'expected chomping or indentation indicators, but found {ch!r}',
                self.reader.get_mark(),
            )
        return chomping, increment
    def scan_block_scalar_ignored_line(self, start_mark: Any) -> Any:
        # Consume the rest of the block scalar header line; return the
        # comment text (leading blanks included) or None if there is none.
        # See the specification for details.
        srp = self.reader.peek
        srf = self.reader.forward
        prefix = ''
        comment = None
        while srp() == ' ':
            # Collect blanks so they can be preserved with the comment.
            prefix += srp()
            srf()
        if srp() == '#':
            comment = prefix
            while srp() not in _THE_END:
                comment += srp()
                srf()
        ch = srp()
        if ch not in _THE_END:
            raise ScannerError(
                'while scanning a block scalar',
                start_mark,
                f'expected a comment or a line break, but found {ch!r}',
                self.reader.get_mark(),
            )
        self.scan_line_break()
        return comment
    def scan_block_scalar_indentation(self) -> Any:
        # Skip leading empty lines of a block scalar (no explicit indentation
        # indicator) and determine the maximum indentation seen.
        # Returns (line breaks, max_indent, end_mark).
        # See the specification for details.
        srp = self.reader.peek
        srf = self.reader.forward
        chunks = []
        first_indent = -1
        max_indent = 0
        end_mark = self.reader.get_mark()
        while srp() in ' \r\n\x85\u2028\u2029':
            if srp() != ' ':
                # Remember the indentation of the first (empty) line seen.
                if first_indent < 0:
                    first_indent = self.reader.column
                chunks.append(self.scan_line_break())
                end_mark = self.reader.get_mark()
            else:
                srf()
                if self.reader.column > max_indent:
                    max_indent = self.reader.column
        # A follow-up line more indented than the first is an error.
        if first_indent > 0 and max_indent > first_indent:
            start_mark = self.reader.get_mark()
            raise ScannerError(
                'more indented follow up line than first in a block scalar', start_mark,
            )
        return chunks, max_indent, end_mark
  1158. def scan_block_scalar_breaks(self, indent: int) -> Any:
  1159. # See the specification for details.
  1160. chunks = []
  1161. srp = self.reader.peek
  1162. srf = self.reader.forward
  1163. end_mark = self.reader.get_mark()
  1164. while self.reader.column < indent and srp() == ' ':
  1165. srf()
  1166. while srp() in '\r\n\x85\u2028\u2029':
  1167. chunks.append(self.scan_line_break())
  1168. end_mark = self.reader.get_mark()
  1169. while self.reader.column < indent and srp() == ' ':
  1170. srf()
  1171. return chunks, end_mark
    def scan_flow_scalar(self, style: Any) -> Any:
        # Scan a single- or double-quoted scalar and return a ScalarToken.
        # See the specification for details.
        # Note that we loose indentation rules for quoted scalars. Quoted
        # scalars don't need to adhere indentation because " and ' clearly
        # mark the beginning and the end of them. Therefore we are less
        # restrictive then the specification requires. We only need to check
        # that document separators are not included in scalars.
        if style == '"':
            double = True
        else:
            double = False
        srp = self.reader.peek
        chunks: List[Any] = []
        start_mark = self.reader.get_mark()
        quote = srp()
        self.reader.forward()
        # Alternate between non-space runs and folded whitespace until the
        # closing quote is reached.
        chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
        while srp() != quote:
            chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
            chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
        self.reader.forward()
        end_mark = self.reader.get_mark()
        return ScalarToken("".join(chunks), False, start_mark, end_mark, style)
    # Single-character escapes recognized in double-quoted scalars, mapped to
    # the character they denote.
    ESCAPE_REPLACEMENTS = {
        '0': '\0',
        'a': '\x07',
        'b': '\x08',
        't': '\x09',
        '\t': '\x09',
        'n': '\x0A',
        'v': '\x0B',
        'f': '\x0C',
        'r': '\x0D',
        'e': '\x1B',
        ' ': '\x20',
        '"': '"',
        '/': '/',  # as per http://www.json.org/
        '\\': '\\',
        'N': '\x85',
        '_': '\xA0',
        'L': '\u2028',
        'P': '\u2029',
    }
    # Numeric escapes: introducer character -> number of hex digits expected
    # (\xXX, \uXXXX, \UXXXXXXXX).
    ESCAPE_CODES = {'x': 2, 'u': 4, 'U': 8}
    def scan_flow_scalar_non_spaces(self, double: Any, start_mark: Any) -> Any:
        # Scan a run of non-whitespace characters inside a quoted scalar,
        # resolving quote doubling ('') and backslash escapes (double quotes).
        # See the specification for details.
        chunks: List[Any] = []
        srp = self.reader.peek
        srf = self.reader.forward
        while True:
            # Take the longest run of plain (non-special) characters at once.
            length = 0
            while srp(length) not in ' \n\'"\\\0\t\r\x85\u2028\u2029':
                length += 1
            if length != 0:
                chunks.append(self.reader.prefix(length))
                srf(length)
            ch = srp()
            if not double and ch == "'" and srp(1) == "'":
                # '' inside a single-quoted scalar is an escaped quote.
                chunks.append("'")
                srf(2)
            elif (double and ch == "'") or (not double and ch in '"\\'):
                # These are literal in the current quoting style.
                chunks.append(ch)
                srf()
            elif double and ch == '\\':
                srf()
                ch = srp()
                if ch in self.ESCAPE_REPLACEMENTS:
                    chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                    srf()
                elif ch in self.ESCAPE_CODES:
                    # \x, \u or \U numeric escape with a fixed digit count.
                    length = self.ESCAPE_CODES[ch]
                    srf()
                    for k in range(length):
                        if srp(k) not in '0123456789ABCDEFabcdef':
                            raise ScannerError(
                                'while scanning a double-quoted scalar',
                                start_mark,
                                f'expected escape sequence of {length:d} '
                                f'hexdecimal numbers, but found {srp(k)!r}',
                                self.reader.get_mark(),
                            )
                    code = int(self.reader.prefix(length), 16)
                    chunks.append(chr(code))
                    srf(length)
                elif ch in '\n\r\x85\u2028\u2029':
                    # Escaped line break: fold away the break itself.
                    self.scan_line_break()
                    chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
                else:
                    raise ScannerError(
                        'while scanning a double-quoted scalar',
                        start_mark,
                        f'found unknown escape character {ch!r}',
                        self.reader.get_mark(),
                    )
            else:
                return chunks
    def scan_flow_scalar_spaces(self, double: Any, start_mark: Any) -> Any:
        # Scan whitespace inside a quoted scalar, applying line folding:
        # a single '\n' folds to a space, further breaks are kept.
        # See the specification for details.
        srp = self.reader.peek
        chunks = []
        length = 0
        while srp(length) in ' \t':
            length += 1
        whitespaces = self.reader.prefix(length)
        self.reader.forward(length)
        ch = srp()
        if ch == '\0':
            # The stream may not end inside a quoted scalar.
            raise ScannerError(
                'while scanning a quoted scalar',
                start_mark,
                'found unexpected end of stream',
                self.reader.get_mark(),
            )
        elif ch in '\r\n\x85\u2028\u2029':
            line_break = self.scan_line_break()
            breaks = self.scan_flow_scalar_breaks(double, start_mark)
            if line_break != '\n':
                chunks.append(line_break)
            elif not breaks:
                # A lone '\n' folds into a single space.
                chunks.append(' ')
            chunks.extend(breaks)
        else:
            chunks.append(whitespaces)
        return chunks
    def scan_flow_scalar_breaks(self, double: Any, start_mark: Any) -> Any:
        # Consume consecutive blank lines inside a quoted scalar and return
        # their line breaks.
        # See the specification for details.
        chunks: List[Any] = []
        srp = self.reader.peek
        srf = self.reader.forward
        while True:
            # Instead of checking indentation, we check for document
            # separators.
            prefix = self.reader.prefix(3)
            if (prefix == '---' or prefix == '...') and srp(3) in _THE_END_SPACE_TAB:
                raise ScannerError(
                    'while scanning a quoted scalar',
                    start_mark,
                    'found unexpected document separator',
                    self.reader.get_mark(),
                )
            while srp() in ' \t':
                srf()
            if srp() in '\r\n\x85\u2028\u2029':
                chunks.append(self.scan_line_break())
            else:
                return chunks
    def scan_plain(self) -> Any:
        # Scan a plain (unquoted) scalar and return a ScalarToken.
        # See the specification for details.
        # We add an additional restriction for the flow context:
        # plain scalars in the flow context cannot contain ',', ': ' and '?'.
        # We also keep track of the `allow_simple_key` flag here.
        # Indentation rules are loosed for the flow context.
        srp = self.reader.peek
        srf = self.reader.forward
        chunks: List[Any] = []
        start_mark = self.reader.get_mark()
        end_mark = start_mark
        indent = self.indent + 1
        # We allow zero indentation for scalars, but then we need to check for
        # document separators at the beginning of the line.
        # if indent == 0:
        #     indent = 1
        spaces: List[Any] = []
        while True:
            length = 0
            if srp() == '#':
                break
            while True:
                ch = srp(length)
                # NOTE: deliberately disabled branch, kept as-is.
                if False and ch == ':' and srp(length + 1) == ',':
                    break
                elif ch == ':' and srp(length + 1) not in _THE_END_SPACE_TAB:
                    pass
                elif ch == '?' and self.scanner_processing_version != (1, 1):
                    pass
                elif (
                    ch in _THE_END_SPACE_TAB
                    or (
                        not self.flow_level
                        and ch == ':'
                        and srp(length + 1) in _THE_END_SPACE_TAB
                    )
                    or (self.flow_level and ch in ',:?[]{}')
                ):
                    break
                length += 1
            # It's not clear what we should do with ':' in the flow context.
            if (
                self.flow_level
                and ch == ':'
                and srp(length + 1) not in '\0 \t\r\n\x85\u2028\u2029,[]{}'
            ):
                srf(length)
                raise ScannerError(
                    'while scanning a plain scalar',
                    start_mark,
                    "found unexpected ':'",
                    self.reader.get_mark(),
                    'Please check '
                    'http://pyyaml.org/wiki/YAMLColonInFlowContext '
                    'for details.',
                )
            if length == 0:
                break
            self.allow_simple_key = False
            chunks.extend(spaces)
            chunks.append(self.reader.prefix(length))
            srf(length)
            end_mark = self.reader.get_mark()
            spaces = self.scan_plain_spaces(indent, start_mark)
            # Stop at a comment, or (in block context) when the continuation
            # line is less indented than the scalar requires.
            if (
                not spaces
                or srp() == '#'
                or (not self.flow_level and self.reader.column < indent)
            ):
                break
        token = ScalarToken("".join(chunks), True, start_mark, end_mark)
        # getattr provides True so C type loader, which cannot handle comment,
        # will not make CommentToken
        if self.loader is not None:
            comment_handler = getattr(self.loader, 'comment_handling', False)
            if comment_handler is None:
                if spaces and spaces[0] == '\n':
                    # Create a comment token to preserve the trailing line breaks.
                    comment = CommentToken("".join(spaces) + '\n', start_mark, end_mark)
                    token.add_post_comment(comment)
            elif comment_handler is not False:
                line = start_mark.line + 1
                for ch in spaces:
                    if ch == '\n':
                        self.comments.add_blank_line('\n', 0, line)  # type: ignore
                        line += 1
        return token
    def scan_plain_spaces(self, indent: Any, start_mark: Any) -> Any:
        # Scan the whitespace between two fragments of a plain scalar;
        # returns the folded whitespace chunks, or None when a document
        # separator ends the scalar.
        # See the specification for details.
        # The specification is really confusing about tabs in plain scalars.
        # We just forbid them completely. Do not use tabs in YAML!
        srp = self.reader.peek
        srf = self.reader.forward
        chunks = []
        length = 0
        while srp(length) in ' ':
            length += 1
        whitespaces = self.reader.prefix(length)
        self.reader.forward(length)
        ch = srp()
        if ch in '\r\n\x85\u2028\u2029':
            line_break = self.scan_line_break()
            # A line break in a plain scalar re-enables simple keys.
            self.allow_simple_key = True
            prefix = self.reader.prefix(3)
            if (prefix == '---' or prefix == '...') and srp(3) in _THE_END_SPACE_TAB:
                return
            breaks = []
            while srp() in ' \r\n\x85\u2028\u2029':
                if srp() == ' ':
                    srf()
                else:
                    breaks.append(self.scan_line_break())
                    prefix = self.reader.prefix(3)
                    if (prefix == '---' or prefix == '...') and srp(3) in _THE_END_SPACE_TAB:
                        return
            if line_break != '\n':
                chunks.append(line_break)
            elif not breaks:
                # A lone '\n' folds into a single space.
                chunks.append(' ')
            chunks.extend(breaks)
        elif whitespaces:
            chunks.append(whitespaces)
        return chunks
    def scan_tag_handle(self, name: Any, start_mark: Any) -> Any:
        # Scan a tag handle: '!', '!!' or '!word!'.
        # See the specification for details.
        # For some strange reasons, the specification does not allow '_' in
        # tag handles. I have allowed it anyway.
        srp = self.reader.peek
        ch = srp()
        if ch != '!':
            raise ScannerError(
                f'while scanning an {name!s}',
                start_mark,
                f"expected '!', but found {ch!r}",
                self.reader.get_mark(),
            )
        length = 1
        ch = srp(length)
        if ch != ' ':
            # A named handle must be closed by a second '!'.
            while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' or ch in '-_':
                length += 1
                ch = srp(length)
            if ch != '!':
                self.reader.forward(length)
                raise ScannerError(
                    f'while scanning an {name!s}',
                    start_mark,
                    f"expected '!' but found {ch!r}",
                    self.reader.get_mark(),
                )
            length += 1
        value = self.reader.prefix(length)
        self.reader.forward(length)
        return value
    def scan_tag_uri(self, name: Any, start_mark: Any) -> Any:
        # Scan a tag URI, decoding %XX escapes along the way.
        # See the specification for details.
        # Note: we do not check if URI is well-formed.
        srp = self.reader.peek
        chunks = []
        length = 0
        ch = srp(length)
        while (
            '0' <= ch <= '9'
            or 'A' <= ch <= 'Z'
            or 'a' <= ch <= 'z'
            or ch in "-;/?:@&=+$,_.!~*'()[]%"
            or ((self.scanner_processing_version > (1, 1)) and ch == '#')
        ):
            if ch == '%':
                # Flush what precedes the escape, then decode %XX sequences.
                chunks.append(self.reader.prefix(length))
                self.reader.forward(length)
                length = 0
                chunks.append(self.scan_uri_escapes(name, start_mark))
            else:
                length += 1
            ch = srp(length)
        if length != 0:
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
            length = 0
        if not chunks:
            raise ScannerError(
                f'while parsing an {name!s}',
                start_mark,
                f'expected URI, but found {ch!r}',
                self.reader.get_mark(),
            )
        return "".join(chunks)
  1506. def scan_uri_escapes(self, name: Any, start_mark: Any) -> Any:
  1507. # See the specification for details.
  1508. srp = self.reader.peek
  1509. srf = self.reader.forward
  1510. code_bytes: List[Any] = []
  1511. mark = self.reader.get_mark()
  1512. while srp() == '%':
  1513. srf()
  1514. for k in range(2):
  1515. if srp(k) not in '0123456789ABCDEFabcdef':
  1516. raise ScannerError(
  1517. f'while scanning an {name!s}',
  1518. start_mark,
  1519. f'expected URI escape sequence of 2 hexdecimal numbers, '
  1520. f'but found {srp(k)!r}',
  1521. self.reader.get_mark(),
  1522. )
  1523. code_bytes.append(int(self.reader.prefix(2), 16))
  1524. srf(2)
  1525. try:
  1526. value = bytes(code_bytes).decode('utf-8')
  1527. except UnicodeDecodeError as exc:
  1528. raise ScannerError(f'while scanning an {name!s}', start_mark, str(exc), mark)
  1529. return value
  1530. def scan_line_break(self) -> Any:
  1531. # Transforms:
  1532. # '\r\n' : '\n'
  1533. # '\r' : '\n'
  1534. # '\n' : '\n'
  1535. # '\x85' : '\n'
  1536. # '\u2028' : '\u2028'
  1537. # '\u2029 : '\u2029'
  1538. # default : ''
  1539. ch = self.reader.peek()
  1540. if ch in '\r\n\x85':
  1541. if self.reader.prefix(2) == '\r\n':
  1542. self.reader.forward(2)
  1543. else:
  1544. self.reader.forward()
  1545. return '\n'
  1546. elif ch in '\u2028\u2029':
  1547. self.reader.forward()
  1548. return ch
  1549. return ""
class RoundTripScanner(Scanner):
    """Scanner that keeps CommentToken instances instead of discarding
    them, attaching them to neighbouring tokens (via add_pre_comments /
    add_post_comment) so comments can be reproduced on output.
    """

    def check_token(self, *choices: Any) -> bool:
        # Check if the next token is one of the given types.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        self._gather_comments()
        if len(self.tokens) > 0:
            if not choices:
                return True
            for choice in choices:
                if isinstance(self.tokens[0], choice):
                    return True
        return False

    def peek_token(self) -> Any:
        # Return the next token, but do not delete it from the queue.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        self._gather_comments()
        if len(self.tokens) > 0:
            return self.tokens[0]
        return None

    def _gather_comments(self) -> Any:
        """combine multiple comment lines and assign to next non-comment-token"""
        comments: List[Any] = []
        if not self.tokens:
            return comments
        # take a leading CommentToken off the queue, if any
        if isinstance(self.tokens[0], CommentToken):
            comment = self.tokens.pop(0)
            self.tokens_taken += 1
            comments.append(comment)
        # keep fetching while the head of the queue stays a comment
        while self.need_more_tokens():
            self.fetch_more_tokens()
            if not self.tokens:
                return comments
            if isinstance(self.tokens[0], CommentToken):
                self.tokens_taken += 1
                comment = self.tokens.pop(0)
                # nprint('dropping2', comment)
                comments.append(comment)
        if len(comments) >= 1:
            self.tokens[0].add_pre_comments(comments)
        # pull in post comment on e.g. ':'
        if not self.done and len(self.tokens) < 2:
            self.fetch_more_tokens()

    def get_token(self) -> Any:
        # Return the next token, merging any adjacent comment tokens
        # into a post-comment on it first.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        self._gather_comments()
        if len(self.tokens) > 0:
            # nprint('tk', self.tokens)
            # only add post comment to single line tokens:
            # scalar, value token. FlowXEndToken, otherwise
            # hidden streamtokens could get them (leave them and they will be
            # pre comments for the next map/seq
            if (
                len(self.tokens) > 1
                and isinstance(
                    self.tokens[0],
                    (ScalarToken, ValueToken, FlowSequenceEndToken, FlowMappingEndToken),
                )
                and isinstance(self.tokens[1], CommentToken)
                and self.tokens[0].end_mark.line == self.tokens[1].start_mark.line
            ):
                # comment on the same line as the token: absorb it, plus
                # any immediately following comment tokens
                self.tokens_taken += 1
                c = self.tokens.pop(1)
                self.fetch_more_tokens()
                while len(self.tokens) > 1 and isinstance(self.tokens[1], CommentToken):
                    self.tokens_taken += 1
                    c1 = self.tokens.pop(1)
                    # re-create the original column offset with spaces
                    c.value = c.value + (' ' * c1.start_mark.column) + c1.value
                    self.fetch_more_tokens()
                self.tokens[0].add_post_comment(c)
            elif (
                len(self.tokens) > 1
                and isinstance(self.tokens[0], ScalarToken)
                and isinstance(self.tokens[1], CommentToken)
                and self.tokens[0].end_mark.line != self.tokens[1].start_mark.line
            ):
                # comment on a later line than the scalar: prefix the
                # intervening newlines and indentation into its value
                self.tokens_taken += 1
                c = self.tokens.pop(1)
                c.value = (
                    '\n' * (c.start_mark.line - self.tokens[0].end_mark.line)
                    + (' ' * c.start_mark.column)
                    + c.value
                )
                self.tokens[0].add_post_comment(c)
                self.fetch_more_tokens()
                while len(self.tokens) > 1 and isinstance(self.tokens[1], CommentToken):
                    self.tokens_taken += 1
                    c1 = self.tokens.pop(1)
                    c.value = c.value + (' ' * c1.start_mark.column) + c1.value
                    self.fetch_more_tokens()
            self.tokens_taken += 1
            return self.tokens.pop(0)
        return None

    def fetch_comment(self, comment: Any) -> None:
        # Turn the (value, start_mark, end_mark) triple produced by
        # scan_to_next_token() into a CommentToken on the queue.
        value, start_mark, end_mark = comment
        while value and value[-1] == ' ':
            # empty line within indented key context
            # no need to update end-mark, that is not used
            value = value[:-1]
        self.tokens.append(CommentToken(value, start_mark, end_mark))

    # scanner

    def scan_to_next_token(self) -> Any:
        # We ignore spaces, line breaks and comments.
        # If we find a line break in the block context, we set the flag
        # `allow_simple_key` on.
        # The byte order mark is stripped if it's the first character in the
        # stream. We do not yet support BOM inside the stream as the
        # specification requires. Any such mark will be considered as a part
        # of the document.
        #
        # TODO: We need to make tab handling rules more sane. A good rule is
        #   Tabs cannot precede tokens
        #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
        #   KEY(block), VALUE(block), BLOCK-ENTRY
        # So the checking code is
        #   if <TAB>:
        #       self.allow_simple_keys = False
        # We also need to add the check for `allow_simple_keys == True` to
        # `unwind_indent` before issuing BLOCK-END.
        # Scanners for block, flow, and plain scalars need to be modified.
        srp = self.reader.peek
        srf = self.reader.forward
        if self.reader.index == 0 and srp() == '\uFEFF':
            srf()
        found = False
        # tabs are only skippable whitespace inside flow context
        white_space = ' \t' if self.flow_level > 0 else ' '
        while not found:
            while srp() in white_space:
                srf()
            ch = srp()
            if ch == '#':
                # collect the comment text up to the end of the line
                start_mark = self.reader.get_mark()
                comment = ch
                srf()
                while ch not in _THE_END:
                    ch = srp()
                    if ch == '\0':  # don't gobble the end-of-stream character
                        # but add an explicit newline as "YAML processors should terminate
                        # the stream with an explicit line break
                        # https://yaml.org/spec/1.2/spec.html#id2780069
                        comment += '\n'
                        break
                    comment += ch
                    srf()
                # gather any blank lines following the comment
                ch = self.scan_line_break()
                while len(ch) > 0:
                    comment += ch
                    ch = self.scan_line_break()
                end_mark = self.reader.get_mark()
                if not self.flow_level:
                    self.allow_simple_key = True
                return comment, start_mark, end_mark
            if self.scan_line_break() != '':
                start_mark = self.reader.get_mark()
                if not self.flow_level:
                    self.allow_simple_key = True
                ch = srp()
                if ch == '\n':  # empty toplevel lines
                    start_mark = self.reader.get_mark()
                    comment = ""
                    while ch:
                        ch = self.scan_line_break(empty_line=True)
                        comment += ch
                    if srp() == '#':
                        # empty line followed by indented real comment
                        comment = comment.rsplit('\n', 1)[0] + '\n'
                    end_mark = self.reader.get_mark()
                    return comment, start_mark, end_mark
            else:
                found = True
        return None

    def scan_line_break(self, empty_line: bool = False) -> Text:
        # Transforms:
        #   '\r\n'      :   '\n'
        #   '\r'        :   '\n'
        #   '\n'        :   '\n'
        #   '\x85'      :   '\n'
        #   '\u2028'    :   '\u2028'
        #   '\u2029     :   '\u2029'
        #   default     :   ''
        # With empty_line=True a tab or space is also consumed and
        # returned (used when collecting empty toplevel lines).
        ch: Text = self.reader.peek()
        if ch in '\r\n\x85':
            if self.reader.prefix(2) == '\r\n':
                self.reader.forward(2)
            else:
                self.reader.forward()
            return '\n'
        elif ch in '\u2028\u2029':
            self.reader.forward()
            return ch
        elif empty_line and ch in '\t ':
            self.reader.forward()
            return ch
        return ""

    def scan_block_scalar(self, style: Any, rt: Optional[bool] = True) -> Any:
        # delegate to the base scanner with round-trip mode enabled
        return Scanner.scan_block_scalar(self, style, rt=rt)

    def scan_uri_escapes(self, name: Any, start_mark: Any) -> Any:
        """
        The roundtripscanner doesn't do URI escaping
        """
        # See the specification for details.
        # Validates that the %XX escapes decode as UTF-8, but returns the
        # raw escape text unchanged (the decoded value is discarded).
        srp = self.reader.peek
        srf = self.reader.forward
        code_bytes: List[Any] = []
        chunk = ''
        mark = self.reader.get_mark()
        while srp() == '%':
            chunk += '%'
            srf()
            for k in range(2):
                if srp(k) not in '0123456789ABCDEFabcdef':
                    raise ScannerError(
                        f'while scanning an {name!s}',
                        start_mark,
                        f'expected URI escape sequence of 2 hexdecimal numbers, '
                        f'but found {srp(k)!r}',
                        self.reader.get_mark(),
                    )
            code_bytes.append(int(self.reader.prefix(2), 16))
            chunk += self.reader.prefix(2)
            srf(2)
        try:
            _ = bytes(code_bytes).decode('utf-8')
        except UnicodeDecodeError as exc:
            raise ScannerError(f'while scanning an {name!s}', start_mark, str(exc), mark)
        return chunk
# commenthandling 2021, differentiation not needed
# Category ids passed to add_comment_eol(); both kinds currently share
# id 0 (a distinct KEYCMNT id of 1 is kept here, commented, for reference).
VALUECMNT = 0
KEYCMNT = 0  # 1
# TAGCMNT = 2
# ANCHORCMNT = 3
  1785. class CommentBase:
  1786. __slots__ = ('value', 'line', 'column', 'used', 'function', 'fline', 'ufun', 'uline')
  1787. def __init__(self, value: Any, line: Any, column: Any) -> None:
  1788. self.value = value
  1789. self.line = line
  1790. self.column = column
  1791. self.used = ' '
  1792. info = inspect.getframeinfo(inspect.stack()[3][0])
  1793. self.function = info.function
  1794. self.fline = info.lineno
  1795. self.ufun = None
  1796. self.uline = None
  1797. def set_used(self, v: Any = '+') -> None:
  1798. self.used = v
  1799. info = inspect.getframeinfo(inspect.stack()[1][0])
  1800. self.ufun = info.function # type: ignore
  1801. self.uline = info.lineno # type: ignore
  1802. def set_assigned(self) -> None:
  1803. self.used = '|'
  1804. def __str__(self) -> str:
  1805. return f'{self.value}'
  1806. def __repr__(self) -> str:
  1807. return f'{self.value!r}'
  1808. def info(self) -> str:
  1809. xv = self.value + '"'
  1810. name = self.name # type: ignore
  1811. return (
  1812. f'{name}{self.used} {self.line:2}:{self.column:<2} "{xv:40s} '
  1813. f'{self.function}:{self.fline} {self.ufun}:{self.uline}'
  1814. )
class EOLComment(CommentBase):
    # comment following other content on the same line
    name = 'EOLC'

    def __init__(self, value: Any, line: Any, column: Any) -> None:
        # NOTE(review): pass-through __init__ looks redundant, but
        # CommentBase.__init__ reads inspect.stack()[3]; removing this
        # layer would change which frame gets recorded — keep it.
        super().__init__(value, line, column)
class FullLineComment(CommentBase):
    # comment occupying a whole line by itself
    name = 'FULL'

    def __init__(self, value: Any, line: Any, column: Any) -> None:
        # NOTE(review): pass-through __init__ looks redundant, but
        # CommentBase.__init__ reads inspect.stack()[3]; removing this
        # layer would change which frame gets recorded — keep it.
        super().__init__(value, line, column)
class BlankLineComment(CommentBase):
    # placeholder for an empty line (value stored without its newline)
    name = 'BLNK'

    def __init__(self, value: Any, line: Any, column: Any) -> None:
        # NOTE(review): pass-through __init__ looks redundant, but
        # CommentBase.__init__ reads inspect.stack()[3]; removing this
        # layer would change which frame gets recorded — keep it.
        super().__init__(value, line, column)
class ScannedComments:
    """Registry of comments found while scanning, keyed by line number.

    ``unused`` holds line numbers of comments not yet attached to any
    token; the assign_* helpers pop entries from it as tokens consume
    their comments.
    """

    def __init__(self: Any) -> None:
        # line number -> EOLComment / FullLineComment / BlankLineComment
        self.comments = {}  # type: ignore
        # line numbers of comments not yet attached to a token
        self.unused = []  # type: ignore

    def add_eol_comment(self, comment: Any, column: Any, line: Any) -> Any:
        """Record a comment that follows other content on its line."""
        # info = inspect.getframeinfo(inspect.stack()[1][0])
        if comment.count('\n') == 1:
            assert comment[-1] == '\n'
        else:
            assert '\n' not in comment
        # the trailing newline (if any) is stripped before storing
        self.comments[line] = retval = EOLComment(comment[:-1], line, column)
        self.unused.append(line)
        return retval

    def add_blank_line(self, comment: Any, column: Any, line: Any) -> Any:
        """Record an empty line (stored without its trailing newline)."""
        # info = inspect.getframeinfo(inspect.stack()[1][0])
        assert comment.count('\n') == 1 and comment[-1] == '\n'
        assert line not in self.comments
        self.comments[line] = retval = BlankLineComment(comment[:-1], line, column)
        self.unused.append(line)
        return retval

    def add_full_line_comment(self, comment: Any, column: Any, line: Any) -> Any:
        """Record a comment that occupies a whole line by itself."""
        # info = inspect.getframeinfo(inspect.stack()[1][0])
        assert comment.count('\n') == 1 and comment[-1] == '\n'
        # if comment.startswith('# C12'):
        #     raise
        # this raises in line 2127 fro 330
        self.comments[line] = retval = FullLineComment(comment[:-1], line, column)
        self.unused.append(line)
        return retval

    def __getitem__(self, idx: Any) -> Any:
        return self.comments[idx]

    def __str__(self) -> Any:
        return (
            'ParsedComments:\n  '
            + '\n  '.join((f'{lineno:2} {x.info()}' for lineno, x in self.comments.items()))
            + '\n'
        )

    def last(self) -> str:
        """Return a diagnostic line for the most recently added comment."""
        lineno, x = list(self.comments.items())[-1]
        return f'{lineno:2} {x.info()}\n'

    def any_unprocessed(self) -> bool:
        # ToDo: might want to differentiate based on lineno
        return len(self.unused) > 0
        # for lno, comment in reversed(self.comments.items()):
        #    if comment.used == ' ':
        #        return True
        # return False

    def unprocessed(self, use: Any = False) -> Any:
        """Yield (line, comment) pairs still unassigned; with ``use``
        truthy each yielded comment is popped and marked as used."""
        while len(self.unused) > 0:
            first = self.unused.pop(0) if use else self.unused[0]
            info = inspect.getframeinfo(inspect.stack()[1][0])
            xprintf('using', first, self.comments[first].value, info.function, info.lineno)
            yield first, self.comments[first]
            if use:
                self.comments[first].set_used()

    def assign_pre(self, token: Any) -> Any:
        """Attach all unused comments on lines before ``token`` to it as
        pre-comments; return whether any were attached."""
        token_line = token.start_mark.line
        info = inspect.getframeinfo(inspect.stack()[1][0])
        xprintf('assign_pre', token_line, self.unused, info.function, info.lineno)
        gobbled = False
        while self.unused and self.unused[0] < token_line:
            gobbled = True
            first = self.unused.pop(0)
            xprintf('assign_pre < ', first)
            self.comments[first].set_used()
            token.add_comment_pre(first)
        return gobbled

    def assign_eol(self, tokens: Any) -> Any:
        """If the oldest unused comment is an EOLComment, attach it to the
        appropriate recently scanned token."""
        try:
            comment_line = self.unused[0]
        except IndexError:
            return
        if not isinstance(self.comments[comment_line], EOLComment):
            return
        # walk back over tokens scanned after the comment's line, and
        # over ValueTokens, to find the token the comment belongs to
        idx = 1
        while tokens[-idx].start_mark.line > comment_line or isinstance(
            tokens[-idx], ValueToken,
        ):
            idx += 1
        xprintf('idx1', idx)
        if (
            len(tokens) > idx
            and isinstance(tokens[-idx], ScalarToken)
            and isinstance(tokens[-(idx + 1)], ScalarToken)
        ):
            return
        try:
            if isinstance(tokens[-idx], ScalarToken) and isinstance(
                tokens[-(idx + 1)], KeyToken,
            ):
                # comment after a key scalar
                try:
                    eol_idx = self.unused.pop(0)
                    self.comments[eol_idx].set_used()
                    xprintf('>>>>>a', idx, eol_idx, KEYCMNT)
                    tokens[-idx].add_comment_eol(eol_idx, KEYCMNT)
                except IndexError:
                    raise NotImplementedError
                return
        except IndexError:
            xprintf('IndexError1')
            pass
        try:
            if isinstance(tokens[-idx], ScalarToken) and isinstance(
                tokens[-(idx + 1)], (ValueToken, BlockEntryToken),
            ):
                # comment after a value scalar or sequence entry
                try:
                    eol_idx = self.unused.pop(0)
                    self.comments[eol_idx].set_used()
                    tokens[-idx].add_comment_eol(eol_idx, VALUECMNT)
                except IndexError:
                    raise NotImplementedError
                return
        except IndexError:
            xprintf('IndexError2')
            pass
        # debugging fallthrough: dump the token queue and abort hard
        for t in tokens:
            xprintf('tt-', t)
        xprintf('not implemented EOL', type(tokens[-idx]))
        import sys

        sys.exit(0)

    def assign_post(self, token: Any) -> Any:
        """Attach all unused comments on lines before ``token`` to it as
        post-comments; return whether any were attached."""
        token_line = token.start_mark.line
        info = inspect.getframeinfo(inspect.stack()[1][0])
        xprintf('assign_post', token_line, self.unused, info.function, info.lineno)
        gobbled = False
        while self.unused and self.unused[0] < token_line:
            gobbled = True
            first = self.unused.pop(0)
            xprintf('assign_post < ', first)
            self.comments[first].set_used()
            token.add_comment_post(first)
        return gobbled

    def str_unprocessed(self) -> Any:
        # diagnostic listing of comments whose usage marker is still ' '
        return ''.join(
            (f'  {ind:2} {x.info()}\n' for ind, x in self.comments.items() if x.used == ' '),
        )
class RoundTripScannerSC(Scanner):  # RoundTripScanner Split Comments
    def __init__(self, *arg: Any, **kw: Any) -> None:
        super().__init__(*arg, **kw)
        assert self.loader is not None
        # comments is initialised on .need_more_tokens and persists on
        # self.loader.parsed_comments
        self.comments = None
  1970. def get_token(self) -> Any:
  1971. # Return the next token.
  1972. while self.need_more_tokens():
  1973. self.fetch_more_tokens()
  1974. if len(self.tokens) > 0:
  1975. if isinstance(self.tokens[0], BlockEndToken):
  1976. self.comments.assign_post(self.tokens[0]) # type: ignore
  1977. else:
  1978. self.comments.assign_pre(self.tokens[0]) # type: ignore
  1979. self.tokens_taken += 1
  1980. return self.tokens.pop(0)
  1981. def need_more_tokens(self) -> bool:
  1982. if self.comments is None:
  1983. self.loader.parsed_comments = self.comments = ScannedComments() # type: ignore
  1984. if self.done:
  1985. return False
  1986. if len(self.tokens) == 0:
  1987. return True
  1988. # The current token may be a potential simple key, so we
  1989. # need to look further.
  1990. self.stale_possible_simple_keys()
  1991. if self.next_possible_simple_key() == self.tokens_taken:
  1992. return True
  1993. if len(self.tokens) < 2:
  1994. return True
  1995. if self.tokens[0].start_mark.line == self.tokens[-1].start_mark.line:
  1996. return True
  1997. if True:
  1998. xprintf('-x--', len(self.tokens))
  1999. for t in self.tokens:
  2000. xprintf(t)
  2001. # xprintf(self.comments.last())
  2002. xprintf(self.comments.str_unprocessed()) # type: ignore
  2003. self.comments.assign_pre(self.tokens[0]) # type: ignore
  2004. self.comments.assign_eol(self.tokens) # type: ignore
  2005. return False
    def scan_to_next_token(self) -> None:
        # Skip spaces, line breaks and comments; comments are recorded in
        # self.comments (split-comment handling) instead of being returned.
        srp = self.reader.peek
        srf = self.reader.forward
        if self.reader.index == 0 and srp() == '\uFEFF':
            srf()
        start_mark = self.reader.get_mark()
        # xprintf('current_mark', start_mark.line, start_mark.column)
        found = False
        while not found:
            while srp() == ' ':
                srf()
            ch = srp()
            if ch == '#':
                # collect the comment text up to the end of the line
                comment_start_mark = self.reader.get_mark()
                comment = ch
                srf()  # skipt the '#'
                while ch not in _THE_END:
                    ch = srp()
                    if ch == '\0':  # don't gobble the end-of-stream character
                        # but add an explicit newline as "YAML processors should terminate
                        # the stream with an explicit line break
                        # https://yaml.org/spec/1.2/spec.html#id2780069
                        comment += '\n'
                        break
                    comment += ch
                    srf()
                # we have a comment
                if start_mark.column == 0:
                    # scanning started at column 0: a full-line comment
                    self.comments.add_full_line_comment(  # type: ignore
                        comment, comment_start_mark.column, comment_start_mark.line,
                    )
                else:
                    # content precedes it on the line: an end-of-line comment
                    self.comments.add_eol_comment(  # type: ignore
                        comment, comment_start_mark.column, comment_start_mark.line,
                    )
                    comment = ""
                # gather any blank lines or full line comments following the comment as well
                self.scan_empty_or_full_line_comments()
                if not self.flow_level:
                    self.allow_simple_key = True
                return
            if bool(self.scan_line_break()):
                # start_mark = self.reader.get_mark()
                if not self.flow_level:
                    self.allow_simple_key = True
                self.scan_empty_or_full_line_comments()
                return None
                # NOTE(review): everything below is unreachable (after the
                # return above); kept as-is, apparently a remnant of the
                # non-split-comment implementation
                ch = srp()
                if ch == '\n':  # empty toplevel lines
                    start_mark = self.reader.get_mark()
                    comment = ""
                    while ch:
                        ch = self.scan_line_break(empty_line=True)
                        comment += ch
                    if srp() == '#':
                        # empty line followed by indented real comment
                        comment = comment.rsplit('\n', 1)[0] + '\n'
                    _ = self.reader.get_mark()  # gobble end_mark
                    return None
            else:
                found = True
        return None
    def scan_empty_or_full_line_comments(self) -> None:
        # Consume any run of blank lines and full-line comments that
        # follows, registering each in self.comments; stops at the first
        # character that starts real content.
        blmark = self.reader.get_mark()
        assert blmark.column == 0
        blanks = ""          # pending whitespace of a (potential) blank line
        comment = None       # text of the comment currently being collected
        mark = None          # mark where that comment started
        ch = self.reader.peek()
        while True:
            # nprint('ch', repr(ch), self.reader.get_mark().column)
            if ch in '\r\n\x85\u2028\u2029':
                # line break terminates either a comment or a blank line
                if self.reader.prefix(2) == '\r\n':
                    self.reader.forward(2)
                else:
                    self.reader.forward()
                if comment is not None:
                    comment += '\n'
                    self.comments.add_full_line_comment(comment, mark.column, mark.line)
                    comment = None
                else:
                    blanks += '\n'
                    self.comments.add_blank_line(blanks, blmark.column, blmark.line)  # type: ignore # NOQA
                    blanks = ""
                blmark = self.reader.get_mark()
                ch = self.reader.peek()
                continue
            if comment is None:
                if ch in ' \t':
                    blanks += ch
                elif ch == '#':
                    # switch to comment-collecting mode
                    mark = self.reader.get_mark()
                    comment = '#'
                else:
                    # xprintf('breaking on', repr(ch))
                    break
            else:
                comment += ch
            self.reader.forward()
            ch = self.reader.peek()
  2106. def scan_block_scalar_ignored_line(self, start_mark: Any) -> Any:
  2107. # See the specification for details.
  2108. srp = self.reader.peek
  2109. srf = self.reader.forward
  2110. prefix = ''
  2111. comment = None
  2112. while srp() == ' ':
  2113. prefix += srp()
  2114. srf()
  2115. if srp() == '#':
  2116. comment = ''
  2117. mark = self.reader.get_mark()
  2118. while srp() not in _THE_END:
  2119. comment += srp()
  2120. srf()
  2121. comment += '\n' # type: ignore
  2122. ch = srp()
  2123. if ch not in _THE_END:
  2124. raise ScannerError(
  2125. 'while scanning a block scalar',
  2126. start_mark,
  2127. f'expected a comment or a line break, but found {ch!r}',
  2128. self.reader.get_mark(),
  2129. )
  2130. if comment is not None:
  2131. self.comments.add_eol_comment(comment, mark.column, mark.line) # type: ignore
  2132. self.scan_line_break()
  2133. return None