1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435 |
- # Scanner produces tokens of the following types:
- # STREAM-START
- # STREAM-END
- # DIRECTIVE(name, value)
- # DOCUMENT-START
- # DOCUMENT-END
- # BLOCK-SEQUENCE-START
- # BLOCK-MAPPING-START
- # BLOCK-END
- # FLOW-SEQUENCE-START
- # FLOW-MAPPING-START
- # FLOW-SEQUENCE-END
- # FLOW-MAPPING-END
- # BLOCK-ENTRY
- # FLOW-ENTRY
- # KEY
- # VALUE
- # ALIAS(value)
- # ANCHOR(value)
- # TAG(value)
- # SCALAR(value, plain, style)
- #
- # Read comments in the Scanner code for more details.
- #
- __all__ = ['Scanner', 'ScannerError']
- from .error import MarkedYAMLError
- from .tokens import *
- class ScannerError(MarkedYAMLError):
- pass
- class SimpleKey:
- # See below simple keys treatment.
- def __init__(self, token_number, required, index, line, column, mark):
- self.token_number = token_number
- self.required = required
- self.index = index
- self.line = line
- self.column = column
- self.mark = mark
- class Scanner:
- def __init__(self):
- """Initialize the scanner."""
- # It is assumed that Scanner and Reader will have a common descendant.
- # Reader do the dirty work of checking for BOM and converting the
- # input data to Unicode. It also adds NUL to the end.
- #
- # Reader supports the following methods
- # self.peek(i=0) # peek the next i-th character
- # self.prefix(l=1) # peek the next l characters
- # self.forward(l=1) # read the next l characters and move the pointer.
- # Had we reached the end of the stream?
- self.done = False
- # The number of unclosed '{' and '['. `flow_level == 0` means block
- # context.
- self.flow_level = 0
- # List of processed tokens that are not yet emitted.
- self.tokens = []
- # Add the STREAM-START token.
- self.fetch_stream_start()
- # Number of tokens that were emitted through the `get_token` method.
- self.tokens_taken = 0
- # The current indentation level.
- self.indent = -1
- # Past indentation levels.
- self.indents = []
- # Variables related to simple keys treatment.
- # A simple key is a key that is not denoted by the '?' indicator.
- # Example of simple keys:
- # ---
- # block simple key: value
- # ? not a simple key:
- # : { flow simple key: value }
- # We emit the KEY token before all keys, so when we find a potential
- # simple key, we try to locate the corresponding ':' indicator.
- # Simple keys should be limited to a single line and 1024 characters.
- # Can a simple key start at the current position? A simple key may
- # start:
- # - at the beginning of the line, not counting indentation spaces
- # (in block context),
- # - after '{', '[', ',' (in the flow context),
- # - after '?', ':', '-' (in the block context).
- # In the block context, this flag also signifies if a block collection
- # may start at the current position.
- self.allow_simple_key = True
- # Keep track of possible simple keys. This is a dictionary. The key
- # is `flow_level`; there can be no more that one possible simple key
- # for each level. The value is a SimpleKey record:
- # (token_number, required, index, line, column, mark)
- # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
- # '[', or '{' tokens.
- self.possible_simple_keys = {}
- # Public methods.
- def check_token(self, *choices):
- # Check if the next token is one of the given types.
- while self.need_more_tokens():
- self.fetch_more_tokens()
- if self.tokens:
- if not choices:
- return True
- for choice in choices:
- if isinstance(self.tokens[0], choice):
- return True
- return False
- def peek_token(self):
- # Return the next token, but do not delete if from the queue.
- # Return None if no more tokens.
- while self.need_more_tokens():
- self.fetch_more_tokens()
- if self.tokens:
- return self.tokens[0]
- else:
- return None
- def get_token(self):
- # Return the next token.
- while self.need_more_tokens():
- self.fetch_more_tokens()
- if self.tokens:
- self.tokens_taken += 1
- return self.tokens.pop(0)
- # Private methods.
- def need_more_tokens(self):
- if self.done:
- return False
- if not self.tokens:
- return True
- # The current token may be a potential simple key, so we
- # need to look further.
- self.stale_possible_simple_keys()
- if self.next_possible_simple_key() == self.tokens_taken:
- return True
- def fetch_more_tokens(self):
- # Eat whitespaces and comments until we reach the next token.
- self.scan_to_next_token()
- # Remove obsolete possible simple keys.
- self.stale_possible_simple_keys()
- # Compare the current indentation and column. It may add some tokens
- # and decrease the current indentation level.
- self.unwind_indent(self.column)
- # Peek the next character.
- ch = self.peek()
- # Is it the end of stream?
- if ch == '\0':
- return self.fetch_stream_end()
- # Is it a directive?
- if ch == '%' and self.check_directive():
- return self.fetch_directive()
- # Is it the document start?
- if ch == '-' and self.check_document_start():
- return self.fetch_document_start()
- # Is it the document end?
- if ch == '.' and self.check_document_end():
- return self.fetch_document_end()
- # TODO: support for BOM within a stream.
- #if ch == '\uFEFF':
- # return self.fetch_bom() <-- issue BOMToken
- # Note: the order of the following checks is NOT significant.
- # Is it the flow sequence start indicator?
- if ch == '[':
- return self.fetch_flow_sequence_start()
- # Is it the flow mapping start indicator?
- if ch == '{':
- return self.fetch_flow_mapping_start()
- # Is it the flow sequence end indicator?
- if ch == ']':
- return self.fetch_flow_sequence_end()
- # Is it the flow mapping end indicator?
- if ch == '}':
- return self.fetch_flow_mapping_end()
- # Is it the flow entry indicator?
- if ch == ',':
- return self.fetch_flow_entry()
- # Is it the block entry indicator?
- if ch == '-' and self.check_block_entry():
- return self.fetch_block_entry()
- # Is it the key indicator?
- if ch == '?' and self.check_key():
- return self.fetch_key()
- # Is it the value indicator?
- if ch == ':' and self.check_value():
- return self.fetch_value()
- # Is it an alias?
- if ch == '*':
- return self.fetch_alias()
- # Is it an anchor?
- if ch == '&':
- return self.fetch_anchor()
- # Is it a tag?
- if ch == '!':
- return self.fetch_tag()
- # Is it a literal scalar?
- if ch == '|' and not self.flow_level:
- return self.fetch_literal()
- # Is it a folded scalar?
- if ch == '>' and not self.flow_level:
- return self.fetch_folded()
- # Is it a single quoted scalar?
- if ch == '\'':
- return self.fetch_single()
- # Is it a double quoted scalar?
- if ch == '\"':
- return self.fetch_double()
- # It must be a plain scalar then.
- if self.check_plain():
- return self.fetch_plain()
- # No? It's an error. Let's produce a nice error message.
- raise ScannerError("while scanning for the next token", None,
- "found character %r that cannot start any token" % ch,
- self.get_mark())
- # Simple keys treatment.
- def next_possible_simple_key(self):
- # Return the number of the nearest possible simple key. Actually we
- # don't need to loop through the whole dictionary. We may replace it
- # with the following code:
- # if not self.possible_simple_keys:
- # return None
- # return self.possible_simple_keys[
- # min(self.possible_simple_keys.keys())].token_number
- min_token_number = None
- for level in self.possible_simple_keys:
- key = self.possible_simple_keys[level]
- if min_token_number is None or key.token_number < min_token_number:
- min_token_number = key.token_number
- return min_token_number
- def stale_possible_simple_keys(self):
- # Remove entries that are no longer possible simple keys. According to
- # the YAML specification, simple keys
- # - should be limited to a single line,
- # - should be no longer than 1024 characters.
- # Disabling this procedure will allow simple keys of any length and
- # height (may cause problems if indentation is broken though).
- for level in list(self.possible_simple_keys):
- key = self.possible_simple_keys[level]
- if key.line != self.line \
- or self.index-key.index > 1024:
- if key.required:
- raise ScannerError("while scanning a simple key", key.mark,
- "could not find expected ':'", self.get_mark())
- del self.possible_simple_keys[level]
- def save_possible_simple_key(self):
- # The next token may start a simple key. We check if it's possible
- # and save its position. This function is called for
- # ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
- # Check if a simple key is required at the current position.
- required = not self.flow_level and self.indent == self.column
- # The next token might be a simple key. Let's save it's number and
- # position.
- if self.allow_simple_key:
- self.remove_possible_simple_key()
- token_number = self.tokens_taken+len(self.tokens)
- key = SimpleKey(token_number, required,
- self.index, self.line, self.column, self.get_mark())
- self.possible_simple_keys[self.flow_level] = key
- def remove_possible_simple_key(self):
- # Remove the saved possible key position at the current flow level.
- if self.flow_level in self.possible_simple_keys:
- key = self.possible_simple_keys[self.flow_level]
-
- if key.required:
- raise ScannerError("while scanning a simple key", key.mark,
- "could not find expected ':'", self.get_mark())
- del self.possible_simple_keys[self.flow_level]
- # Indentation functions.
- def unwind_indent(self, column):
- ## In flow context, tokens should respect indentation.
- ## Actually the condition should be `self.indent >= column` according to
- ## the spec. But this condition will prohibit intuitively correct
- ## constructions such as
- ## key : {
- ## }
- #if self.flow_level and self.indent > column:
- # raise ScannerError(None, None,
- # "invalid indentation or unclosed '[' or '{'",
- # self.get_mark())
- # In the flow context, indentation is ignored. We make the scanner less
- # restrictive then specification requires.
- if self.flow_level:
- return
- # In block context, we may need to issue the BLOCK-END tokens.
- while self.indent > column:
- mark = self.get_mark()
- self.indent = self.indents.pop()
- self.tokens.append(BlockEndToken(mark, mark))
- def add_indent(self, column):
- # Check if we need to increase indentation.
- if self.indent < column:
- self.indents.append(self.indent)
- self.indent = column
- return True
- return False
- # Fetchers.
- def fetch_stream_start(self):
- # We always add STREAM-START as the first token and STREAM-END as the
- # last token.
- # Read the token.
- mark = self.get_mark()
-
- # Add STREAM-START.
- self.tokens.append(StreamStartToken(mark, mark,
- encoding=self.encoding))
-
- def fetch_stream_end(self):
- # Set the current indentation to -1.
- self.unwind_indent(-1)
- # Reset simple keys.
- self.remove_possible_simple_key()
- self.allow_simple_key = False
- self.possible_simple_keys = {}
- # Read the token.
- mark = self.get_mark()
-
- # Add STREAM-END.
- self.tokens.append(StreamEndToken(mark, mark))
- # The steam is finished.
- self.done = True
- def fetch_directive(self):
-
- # Set the current indentation to -1.
- self.unwind_indent(-1)
- # Reset simple keys.
- self.remove_possible_simple_key()
- self.allow_simple_key = False
- # Scan and add DIRECTIVE.
- self.tokens.append(self.scan_directive())
- def fetch_document_start(self):
- self.fetch_document_indicator(DocumentStartToken)
- def fetch_document_end(self):
- self.fetch_document_indicator(DocumentEndToken)
- def fetch_document_indicator(self, TokenClass):
- # Set the current indentation to -1.
- self.unwind_indent(-1)
- # Reset simple keys. Note that there could not be a block collection
- # after '---'.
- self.remove_possible_simple_key()
- self.allow_simple_key = False
- # Add DOCUMENT-START or DOCUMENT-END.
- start_mark = self.get_mark()
- self.forward(3)
- end_mark = self.get_mark()
- self.tokens.append(TokenClass(start_mark, end_mark))
- def fetch_flow_sequence_start(self):
- self.fetch_flow_collection_start(FlowSequenceStartToken)
- def fetch_flow_mapping_start(self):
- self.fetch_flow_collection_start(FlowMappingStartToken)
- def fetch_flow_collection_start(self, TokenClass):
- # '[' and '{' may start a simple key.
- self.save_possible_simple_key()
- # Increase the flow level.
- self.flow_level += 1
- # Simple keys are allowed after '[' and '{'.
- self.allow_simple_key = True
- # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
- start_mark = self.get_mark()
- self.forward()
- end_mark = self.get_mark()
- self.tokens.append(TokenClass(start_mark, end_mark))
- def fetch_flow_sequence_end(self):
- self.fetch_flow_collection_end(FlowSequenceEndToken)
- def fetch_flow_mapping_end(self):
- self.fetch_flow_collection_end(FlowMappingEndToken)
- def fetch_flow_collection_end(self, TokenClass):
- # Reset possible simple key on the current level.
- self.remove_possible_simple_key()
- # Decrease the flow level.
- self.flow_level -= 1
- # No simple keys after ']' or '}'.
- self.allow_simple_key = False
- # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
- start_mark = self.get_mark()
- self.forward()
- end_mark = self.get_mark()
- self.tokens.append(TokenClass(start_mark, end_mark))
- def fetch_flow_entry(self):
- # Simple keys are allowed after ','.
- self.allow_simple_key = True
- # Reset possible simple key on the current level.
- self.remove_possible_simple_key()
- # Add FLOW-ENTRY.
- start_mark = self.get_mark()
- self.forward()
- end_mark = self.get_mark()
- self.tokens.append(FlowEntryToken(start_mark, end_mark))
- def fetch_block_entry(self):
- # Block context needs additional checks.
- if not self.flow_level:
- # Are we allowed to start a new entry?
- if not self.allow_simple_key:
- raise ScannerError(None, None,
- "sequence entries are not allowed here",
- self.get_mark())
- # We may need to add BLOCK-SEQUENCE-START.
- if self.add_indent(self.column):
- mark = self.get_mark()
- self.tokens.append(BlockSequenceStartToken(mark, mark))
- # It's an error for the block entry to occur in the flow context,
- # but we let the parser detect this.
- else:
- pass
- # Simple keys are allowed after '-'.
- self.allow_simple_key = True
- # Reset possible simple key on the current level.
- self.remove_possible_simple_key()
- # Add BLOCK-ENTRY.
- start_mark = self.get_mark()
- self.forward()
- end_mark = self.get_mark()
- self.tokens.append(BlockEntryToken(start_mark, end_mark))
- def fetch_key(self):
-
- # Block context needs additional checks.
- if not self.flow_level:
- # Are we allowed to start a key (not necessary a simple)?
- if not self.allow_simple_key:
- raise ScannerError(None, None,
- "mapping keys are not allowed here",
- self.get_mark())
- # We may need to add BLOCK-MAPPING-START.
- if self.add_indent(self.column):
- mark = self.get_mark()
- self.tokens.append(BlockMappingStartToken(mark, mark))
- # Simple keys are allowed after '?' in the block context.
- self.allow_simple_key = not self.flow_level
- # Reset possible simple key on the current level.
- self.remove_possible_simple_key()
- # Add KEY.
- start_mark = self.get_mark()
- self.forward()
- end_mark = self.get_mark()
- self.tokens.append(KeyToken(start_mark, end_mark))
- def fetch_value(self):
- # Do we determine a simple key?
- if self.flow_level in self.possible_simple_keys:
- # Add KEY.
- key = self.possible_simple_keys[self.flow_level]
- del self.possible_simple_keys[self.flow_level]
- self.tokens.insert(key.token_number-self.tokens_taken,
- KeyToken(key.mark, key.mark))
- # If this key starts a new block mapping, we need to add
- # BLOCK-MAPPING-START.
- if not self.flow_level:
- if self.add_indent(key.column):
- self.tokens.insert(key.token_number-self.tokens_taken,
- BlockMappingStartToken(key.mark, key.mark))
- # There cannot be two simple keys one after another.
- self.allow_simple_key = False
- # It must be a part of a complex key.
- else:
-
- # Block context needs additional checks.
- # (Do we really need them? They will be caught by the parser
- # anyway.)
- if not self.flow_level:
- # We are allowed to start a complex value if and only if
- # we can start a simple key.
- if not self.allow_simple_key:
- raise ScannerError(None, None,
- "mapping values are not allowed here",
- self.get_mark())
- # If this value starts a new block mapping, we need to add
- # BLOCK-MAPPING-START. It will be detected as an error later by
- # the parser.
- if not self.flow_level:
- if self.add_indent(self.column):
- mark = self.get_mark()
- self.tokens.append(BlockMappingStartToken(mark, mark))
- # Simple keys are allowed after ':' in the block context.
- self.allow_simple_key = not self.flow_level
- # Reset possible simple key on the current level.
- self.remove_possible_simple_key()
- # Add VALUE.
- start_mark = self.get_mark()
- self.forward()
- end_mark = self.get_mark()
- self.tokens.append(ValueToken(start_mark, end_mark))
- def fetch_alias(self):
- # ALIAS could be a simple key.
- self.save_possible_simple_key()
- # No simple keys after ALIAS.
- self.allow_simple_key = False
- # Scan and add ALIAS.
- self.tokens.append(self.scan_anchor(AliasToken))
- def fetch_anchor(self):
- # ANCHOR could start a simple key.
- self.save_possible_simple_key()
- # No simple keys after ANCHOR.
- self.allow_simple_key = False
- # Scan and add ANCHOR.
- self.tokens.append(self.scan_anchor(AnchorToken))
- def fetch_tag(self):
- # TAG could start a simple key.
- self.save_possible_simple_key()
- # No simple keys after TAG.
- self.allow_simple_key = False
- # Scan and add TAG.
- self.tokens.append(self.scan_tag())
- def fetch_literal(self):
- self.fetch_block_scalar(style='|')
- def fetch_folded(self):
- self.fetch_block_scalar(style='>')
- def fetch_block_scalar(self, style):
- # A simple key may follow a block scalar.
- self.allow_simple_key = True
- # Reset possible simple key on the current level.
- self.remove_possible_simple_key()
- # Scan and add SCALAR.
- self.tokens.append(self.scan_block_scalar(style))
- def fetch_single(self):
- self.fetch_flow_scalar(style='\'')
- def fetch_double(self):
- self.fetch_flow_scalar(style='"')
- def fetch_flow_scalar(self, style):
- # A flow scalar could be a simple key.
- self.save_possible_simple_key()
- # No simple keys after flow scalars.
- self.allow_simple_key = False
- # Scan and add SCALAR.
- self.tokens.append(self.scan_flow_scalar(style))
- def fetch_plain(self):
- # A plain scalar could be a simple key.
- self.save_possible_simple_key()
- # No simple keys after plain scalars. But note that `scan_plain` will
- # change this flag if the scan is finished at the beginning of the
- # line.
- self.allow_simple_key = False
- # Scan and add SCALAR. May change `allow_simple_key`.
- self.tokens.append(self.scan_plain())
- # Checkers.
- def check_directive(self):
- # DIRECTIVE: ^ '%' ...
- # The '%' indicator is already checked.
- if self.column == 0:
- return True
- def check_document_start(self):
- # DOCUMENT-START: ^ '---' (' '|'\n')
- if self.column == 0:
- if self.prefix(3) == '---' \
- and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
- return True
- def check_document_end(self):
- # DOCUMENT-END: ^ '...' (' '|'\n')
- if self.column == 0:
- if self.prefix(3) == '...' \
- and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
- return True
- def check_block_entry(self):
- # BLOCK-ENTRY: '-' (' '|'\n')
- return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'
- def check_key(self):
- # KEY(flow context): '?'
- if self.flow_level:
- return True
- # KEY(block context): '?' (' '|'\n')
- else:
- return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'
- def check_value(self):
- # VALUE(flow context): ':'
- if self.flow_level:
- return True
- # VALUE(block context): ':' (' '|'\n')
- else:
- return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'
- def check_plain(self):
- # A plain scalar may start with any non-space character except:
- # '-', '?', ':', ',', '[', ']', '{', '}',
- # '#', '&', '*', '!', '|', '>', '\'', '\"',
- # '%', '@', '`'.
- #
- # It may also start with
- # '-', '?', ':'
- # if it is followed by a non-space character.
- #
- # Note that we limit the last rule to the block context (except the
- # '-' character) because we want the flow context to be space
- # independent.
- ch = self.peek()
- return ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`' \
- or (self.peek(1) not in '\0 \t\r\n\x85\u2028\u2029'
- and (ch == '-' or (not self.flow_level and ch in '?:')))
- # Scanners.
- def scan_to_next_token(self):
- # We ignore spaces, line breaks and comments.
- # If we find a line break in the block context, we set the flag
- # `allow_simple_key` on.
- # The byte order mark is stripped if it's the first character in the
- # stream. We do not yet support BOM inside the stream as the
- # specification requires. Any such mark will be considered as a part
- # of the document.
- #
- # TODO: We need to make tab handling rules more sane. A good rule is
- # Tabs cannot precede tokens
- # BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
- # KEY(block), VALUE(block), BLOCK-ENTRY
- # So the checking code is
- # if <TAB>:
- # self.allow_simple_keys = False
- # We also need to add the check for `allow_simple_keys == True` to
- # `unwind_indent` before issuing BLOCK-END.
- # Scanners for block, flow, and plain scalars need to be modified.
- if self.index == 0 and self.peek() == '\uFEFF':
- self.forward()
- found = False
- while not found:
- while self.peek() == ' ':
- self.forward()
- if self.peek() == '#':
- while self.peek() not in '\0\r\n\x85\u2028\u2029':
- self.forward()
- if self.scan_line_break():
- if not self.flow_level:
- self.allow_simple_key = True
- else:
- found = True
- def scan_directive(self):
- # See the specification for details.
- start_mark = self.get_mark()
- self.forward()
- name = self.scan_directive_name(start_mark)
- value = None
- if name == 'YAML':
- value = self.scan_yaml_directive_value(start_mark)
- end_mark = self.get_mark()
- elif name == 'TAG':
- value = self.scan_tag_directive_value(start_mark)
- end_mark = self.get_mark()
- else:
- end_mark = self.get_mark()
- while self.peek() not in '\0\r\n\x85\u2028\u2029':
- self.forward()
- self.scan_directive_ignored_line(start_mark)
- return DirectiveToken(name, value, start_mark, end_mark)
- def scan_directive_name(self, start_mark):
- # See the specification for details.
- length = 0
- ch = self.peek(length)
- while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
- or ch in '-_':
- length += 1
- ch = self.peek(length)
- if not length:
- raise ScannerError("while scanning a directive", start_mark,
- "expected alphabetic or numeric character, but found %r"
- % ch, self.get_mark())
- value = self.prefix(length)
- self.forward(length)
- ch = self.peek()
- if ch not in '\0 \r\n\x85\u2028\u2029':
- raise ScannerError("while scanning a directive", start_mark,
- "expected alphabetic or numeric character, but found %r"
- % ch, self.get_mark())
- return value
- def scan_yaml_directive_value(self, start_mark):
- # See the specification for details.
- while self.peek() == ' ':
- self.forward()
- major = self.scan_yaml_directive_number(start_mark)
- if self.peek() != '.':
- raise ScannerError("while scanning a directive", start_mark,
- "expected a digit or '.', but found %r" % self.peek(),
- self.get_mark())
- self.forward()
- minor = self.scan_yaml_directive_number(start_mark)
- if self.peek() not in '\0 \r\n\x85\u2028\u2029':
- raise ScannerError("while scanning a directive", start_mark,
- "expected a digit or ' ', but found %r" % self.peek(),
- self.get_mark())
- return (major, minor)
- def scan_yaml_directive_number(self, start_mark):
- # See the specification for details.
- ch = self.peek()
- if not ('0' <= ch <= '9'):
- raise ScannerError("while scanning a directive", start_mark,
- "expected a digit, but found %r" % ch, self.get_mark())
- length = 0
- while '0' <= self.peek(length) <= '9':
- length += 1
- value = int(self.prefix(length))
- self.forward(length)
- return value
- def scan_tag_directive_value(self, start_mark):
- # See the specification for details.
- while self.peek() == ' ':
- self.forward()
- handle = self.scan_tag_directive_handle(start_mark)
- while self.peek() == ' ':
- self.forward()
- prefix = self.scan_tag_directive_prefix(start_mark)
- return (handle, prefix)
- def scan_tag_directive_handle(self, start_mark):
- # See the specification for details.
- value = self.scan_tag_handle('directive', start_mark)
- ch = self.peek()
- if ch != ' ':
- raise ScannerError("while scanning a directive", start_mark,
- "expected ' ', but found %r" % ch, self.get_mark())
- return value
- def scan_tag_directive_prefix(self, start_mark):
- # See the specification for details.
- value = self.scan_tag_uri('directive', start_mark)
- ch = self.peek()
- if ch not in '\0 \r\n\x85\u2028\u2029':
- raise ScannerError("while scanning a directive", start_mark,
- "expected ' ', but found %r" % ch, self.get_mark())
- return value
- def scan_directive_ignored_line(self, start_mark):
- # See the specification for details.
- while self.peek() == ' ':
- self.forward()
- if self.peek() == '#':
- while self.peek() not in '\0\r\n\x85\u2028\u2029':
- self.forward()
- ch = self.peek()
- if ch not in '\0\r\n\x85\u2028\u2029':
- raise ScannerError("while scanning a directive", start_mark,
- "expected a comment or a line break, but found %r"
- % ch, self.get_mark())
- self.scan_line_break()
- def scan_anchor(self, TokenClass):
- # The specification does not restrict characters for anchors and
- # aliases. This may lead to problems, for instance, the document:
- # [ *alias, value ]
- # can be interpreted in two ways, as
- # [ "value" ]
- # and
- # [ *alias , "value" ]
- # Therefore we restrict aliases to numbers and ASCII letters.
- start_mark = self.get_mark()
- indicator = self.peek()
- if indicator == '*':
- name = 'alias'
- else:
- name = 'anchor'
- self.forward()
- length = 0
- ch = self.peek(length)
- while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
- or ch in '-_':
- length += 1
- ch = self.peek(length)
- if not length:
- raise ScannerError("while scanning an %s" % name, start_mark,
- "expected alphabetic or numeric character, but found %r"
- % ch, self.get_mark())
- value = self.prefix(length)
- self.forward(length)
- ch = self.peek()
- if ch not in '\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
- raise ScannerError("while scanning an %s" % name, start_mark,
- "expected alphabetic or numeric character, but found %r"
- % ch, self.get_mark())
- end_mark = self.get_mark()
- return TokenClass(value, start_mark, end_mark)
- def scan_tag(self):
- # See the specification for details.
- start_mark = self.get_mark()
- ch = self.peek(1)
- if ch == '<':
- handle = None
- self.forward(2)
- suffix = self.scan_tag_uri('tag', start_mark)
- if self.peek() != '>':
- raise ScannerError("while parsing a tag", start_mark,
- "expected '>', but found %r" % self.peek(),
- self.get_mark())
- self.forward()
- elif ch in '\0 \t\r\n\x85\u2028\u2029':
- handle = None
- suffix = '!'
- self.forward()
- else:
- length = 1
- use_handle = False
- while ch not in '\0 \r\n\x85\u2028\u2029':
- if ch == '!':
- use_handle = True
- break
- length += 1
- ch = self.peek(length)
- handle = '!'
- if use_handle:
- handle = self.scan_tag_handle('tag', start_mark)
- else:
- handle = '!'
- self.forward()
- suffix = self.scan_tag_uri('tag', start_mark)
- ch = self.peek()
- if ch not in '\0 \r\n\x85\u2028\u2029':
- raise ScannerError("while scanning a tag", start_mark,
- "expected ' ', but found %r" % ch, self.get_mark())
- value = (handle, suffix)
- end_mark = self.get_mark()
- return TagToken(value, start_mark, end_mark)
- def scan_block_scalar(self, style):
- # See the specification for details.
- if style == '>':
- folded = True
- else:
- folded = False
- chunks = []
- start_mark = self.get_mark()
- # Scan the header.
- self.forward()
- chomping, increment = self.scan_block_scalar_indicators(start_mark)
- self.scan_block_scalar_ignored_line(start_mark)
- # Determine the indentation level and go to the first non-empty line.
- min_indent = self.indent+1
- if min_indent < 1:
- min_indent = 1
- if increment is None:
- breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
- indent = max(min_indent, max_indent)
- else:
- indent = min_indent+increment-1
- breaks, end_mark = self.scan_block_scalar_breaks(indent)
- line_break = ''
- # Scan the inner part of the block scalar.
- while self.column == indent and self.peek() != '\0':
- chunks.extend(breaks)
- leading_non_space = self.peek() not in ' \t'
- length = 0
- while self.peek(length) not in '\0\r\n\x85\u2028\u2029':
- length += 1
- chunks.append(self.prefix(length))
- self.forward(length)
- line_break = self.scan_line_break()
- breaks, end_mark = self.scan_block_scalar_breaks(indent)
- if self.column == indent and self.peek() != '\0':
- # Unfortunately, folding rules are ambiguous.
- #
- # This is the folding according to the specification:
-
- if folded and line_break == '\n' \
- and leading_non_space and self.peek() not in ' \t':
- if not breaks:
- chunks.append(' ')
- else:
- chunks.append(line_break)
-
- # This is Clark Evans's interpretation (also in the spec
- # examples):
- #
- #if folded and line_break == '\n':
- # if not breaks:
- # if self.peek() not in ' \t':
- # chunks.append(' ')
- # else:
- # chunks.append(line_break)
- #else:
- # chunks.append(line_break)
- else:
- break
- # Chomp the tail.
- if chomping is not False:
- chunks.append(line_break)
- if chomping is True:
- chunks.extend(breaks)
- # We are done.
- return ScalarToken(''.join(chunks), False, start_mark, end_mark,
- style)
- def scan_block_scalar_indicators(self, start_mark):
- # See the specification for details.
- chomping = None
- increment = None
- ch = self.peek()
- if ch in '+-':
- if ch == '+':
- chomping = True
- else:
- chomping = False
- self.forward()
- ch = self.peek()
- if ch in '0123456789':
- increment = int(ch)
- if increment == 0:
- raise ScannerError("while scanning a block scalar", start_mark,
- "expected indentation indicator in the range 1-9, but found 0",
- self.get_mark())
- self.forward()
- elif ch in '0123456789':
- increment = int(ch)
- if increment == 0:
- raise ScannerError("while scanning a block scalar", start_mark,
- "expected indentation indicator in the range 1-9, but found 0",
- self.get_mark())
- self.forward()
- ch = self.peek()
- if ch in '+-':
- if ch == '+':
- chomping = True
- else:
- chomping = False
- self.forward()
- ch = self.peek()
- if ch not in '\0 \r\n\x85\u2028\u2029':
- raise ScannerError("while scanning a block scalar", start_mark,
- "expected chomping or indentation indicators, but found %r"
- % ch, self.get_mark())
- return chomping, increment
- def scan_block_scalar_ignored_line(self, start_mark):
- # See the specification for details.
- while self.peek() == ' ':
- self.forward()
- if self.peek() == '#':
- while self.peek() not in '\0\r\n\x85\u2028\u2029':
- self.forward()
- ch = self.peek()
- if ch not in '\0\r\n\x85\u2028\u2029':
- raise ScannerError("while scanning a block scalar", start_mark,
- "expected a comment or a line break, but found %r" % ch,
- self.get_mark())
- self.scan_line_break()
- def scan_block_scalar_indentation(self):
- # See the specification for details.
- chunks = []
- max_indent = 0
- end_mark = self.get_mark()
- while self.peek() in ' \r\n\x85\u2028\u2029':
- if self.peek() != ' ':
- chunks.append(self.scan_line_break())
- end_mark = self.get_mark()
- else:
- self.forward()
- if self.column > max_indent:
- max_indent = self.column
- return chunks, max_indent, end_mark
- def scan_block_scalar_breaks(self, indent):
- # See the specification for details.
- chunks = []
- end_mark = self.get_mark()
- while self.column < indent and self.peek() == ' ':
- self.forward()
- while self.peek() in '\r\n\x85\u2028\u2029':
- chunks.append(self.scan_line_break())
- end_mark = self.get_mark()
- while self.column < indent and self.peek() == ' ':
- self.forward()
- return chunks, end_mark
- def scan_flow_scalar(self, style):
- # See the specification for details.
- # Note that we loose indentation rules for quoted scalars. Quoted
- # scalars don't need to adhere indentation because " and ' clearly
- # mark the beginning and the end of them. Therefore we are less
- # restrictive then the specification requires. We only need to check
- # that document separators are not included in scalars.
- if style == '"':
- double = True
- else:
- double = False
- chunks = []
- start_mark = self.get_mark()
- quote = self.peek()
- self.forward()
- chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
- while self.peek() != quote:
- chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
- chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
- self.forward()
- end_mark = self.get_mark()
- return ScalarToken(''.join(chunks), False, start_mark, end_mark,
- style)
- ESCAPE_REPLACEMENTS = {
- '0': '\0',
- 'a': '\x07',
- 'b': '\x08',
- 't': '\x09',
- '\t': '\x09',
- 'n': '\x0A',
- 'v': '\x0B',
- 'f': '\x0C',
- 'r': '\x0D',
- 'e': '\x1B',
- ' ': '\x20',
- '\"': '\"',
- '\\': '\\',
- '/': '/',
- 'N': '\x85',
- '_': '\xA0',
- 'L': '\u2028',
- 'P': '\u2029',
- }
- ESCAPE_CODES = {
- 'x': 2,
- 'u': 4,
- 'U': 8,
- }
- def scan_flow_scalar_non_spaces(self, double, start_mark):
- # See the specification for details.
- chunks = []
- while True:
- length = 0
- while self.peek(length) not in '\'\"\\\0 \t\r\n\x85\u2028\u2029':
- length += 1
- if length:
- chunks.append(self.prefix(length))
- self.forward(length)
- ch = self.peek()
- if not double and ch == '\'' and self.peek(1) == '\'':
- chunks.append('\'')
- self.forward(2)
- elif (double and ch == '\'') or (not double and ch in '\"\\'):
- chunks.append(ch)
- self.forward()
- elif double and ch == '\\':
- self.forward()
- ch = self.peek()
- if ch in self.ESCAPE_REPLACEMENTS:
- chunks.append(self.ESCAPE_REPLACEMENTS[ch])
- self.forward()
- elif ch in self.ESCAPE_CODES:
- length = self.ESCAPE_CODES[ch]
- self.forward()
- for k in range(length):
- if self.peek(k) not in '0123456789ABCDEFabcdef':
- raise ScannerError("while scanning a double-quoted scalar", start_mark,
- "expected escape sequence of %d hexdecimal numbers, but found %r" %
- (length, self.peek(k)), self.get_mark())
- code = int(self.prefix(length), 16)
- chunks.append(chr(code))
- self.forward(length)
- elif ch in '\r\n\x85\u2028\u2029':
- self.scan_line_break()
- chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
- else:
- raise ScannerError("while scanning a double-quoted scalar", start_mark,
- "found unknown escape character %r" % ch, self.get_mark())
- else:
- return chunks
- def scan_flow_scalar_spaces(self, double, start_mark):
- # See the specification for details.
- chunks = []
- length = 0
- while self.peek(length) in ' \t':
- length += 1
- whitespaces = self.prefix(length)
- self.forward(length)
- ch = self.peek()
- if ch == '\0':
- raise ScannerError("while scanning a quoted scalar", start_mark,
- "found unexpected end of stream", self.get_mark())
- elif ch in '\r\n\x85\u2028\u2029':
- line_break = self.scan_line_break()
- breaks = self.scan_flow_scalar_breaks(double, start_mark)
- if line_break != '\n':
- chunks.append(line_break)
- elif not breaks:
- chunks.append(' ')
- chunks.extend(breaks)
- else:
- chunks.append(whitespaces)
- return chunks
- def scan_flow_scalar_breaks(self, double, start_mark):
- # See the specification for details.
- chunks = []
- while True:
- # Instead of checking indentation, we check for document
- # separators.
- prefix = self.prefix(3)
- if (prefix == '---' or prefix == '...') \
- and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
- raise ScannerError("while scanning a quoted scalar", start_mark,
- "found unexpected document separator", self.get_mark())
- while self.peek() in ' \t':
- self.forward()
- if self.peek() in '\r\n\x85\u2028\u2029':
- chunks.append(self.scan_line_break())
- else:
- return chunks
- def scan_plain(self):
- # See the specification for details.
- # We add an additional restriction for the flow context:
- # plain scalars in the flow context cannot contain ',' or '?'.
- # We also keep track of the `allow_simple_key` flag here.
- # Indentation rules are loosed for the flow context.
- chunks = []
- start_mark = self.get_mark()
- end_mark = start_mark
- indent = self.indent+1
- # We allow zero indentation for scalars, but then we need to check for
- # document separators at the beginning of the line.
- #if indent == 0:
- # indent = 1
- spaces = []
- while True:
- length = 0
- if self.peek() == '#':
- break
- while True:
- ch = self.peek(length)
- if ch in '\0 \t\r\n\x85\u2028\u2029' \
- or (ch == ':' and
- self.peek(length+1) in '\0 \t\r\n\x85\u2028\u2029'
- + (u',[]{}' if self.flow_level else u''))\
- or (self.flow_level and ch in ',?[]{}'):
- break
- length += 1
- if length == 0:
- break
- self.allow_simple_key = False
- chunks.extend(spaces)
- chunks.append(self.prefix(length))
- self.forward(length)
- end_mark = self.get_mark()
- spaces = self.scan_plain_spaces(indent, start_mark)
- if not spaces or self.peek() == '#' \
- or (not self.flow_level and self.column < indent):
- break
- return ScalarToken(''.join(chunks), True, start_mark, end_mark)
- def scan_plain_spaces(self, indent, start_mark):
- # See the specification for details.
- # The specification is really confusing about tabs in plain scalars.
- # We just forbid them completely. Do not use tabs in YAML!
- chunks = []
- length = 0
- while self.peek(length) in ' ':
- length += 1
- whitespaces = self.prefix(length)
- self.forward(length)
- ch = self.peek()
- if ch in '\r\n\x85\u2028\u2029':
- line_break = self.scan_line_break()
- self.allow_simple_key = True
- prefix = self.prefix(3)
- if (prefix == '---' or prefix == '...') \
- and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
- return
- breaks = []
- while self.peek() in ' \r\n\x85\u2028\u2029':
- if self.peek() == ' ':
- self.forward()
- else:
- breaks.append(self.scan_line_break())
- prefix = self.prefix(3)
- if (prefix == '---' or prefix == '...') \
- and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
- return
- if line_break != '\n':
- chunks.append(line_break)
- elif not breaks:
- chunks.append(' ')
- chunks.extend(breaks)
- elif whitespaces:
- chunks.append(whitespaces)
- return chunks
- def scan_tag_handle(self, name, start_mark):
- # See the specification for details.
- # For some strange reasons, the specification does not allow '_' in
- # tag handles. I have allowed it anyway.
- ch = self.peek()
- if ch != '!':
- raise ScannerError("while scanning a %s" % name, start_mark,
- "expected '!', but found %r" % ch, self.get_mark())
- length = 1
- ch = self.peek(length)
- if ch != ' ':
- while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
- or ch in '-_':
- length += 1
- ch = self.peek(length)
- if ch != '!':
- self.forward(length)
- raise ScannerError("while scanning a %s" % name, start_mark,
- "expected '!', but found %r" % ch, self.get_mark())
- length += 1
- value = self.prefix(length)
- self.forward(length)
- return value
- def scan_tag_uri(self, name, start_mark):
- # See the specification for details.
- # Note: we do not check if URI is well-formed.
- chunks = []
- length = 0
- ch = self.peek(length)
- while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
- or ch in '-;/?:@&=+$,_.!~*\'()[]%':
- if ch == '%':
- chunks.append(self.prefix(length))
- self.forward(length)
- length = 0
- chunks.append(self.scan_uri_escapes(name, start_mark))
- else:
- length += 1
- ch = self.peek(length)
- if length:
- chunks.append(self.prefix(length))
- self.forward(length)
- length = 0
- if not chunks:
- raise ScannerError("while parsing a %s" % name, start_mark,
- "expected URI, but found %r" % ch, self.get_mark())
- return ''.join(chunks)
- def scan_uri_escapes(self, name, start_mark):
- # See the specification for details.
- codes = []
- mark = self.get_mark()
- while self.peek() == '%':
- self.forward()
- for k in range(2):
- if self.peek(k) not in '0123456789ABCDEFabcdef':
- raise ScannerError("while scanning a %s" % name, start_mark,
- "expected URI escape sequence of 2 hexdecimal numbers, but found %r"
- % self.peek(k), self.get_mark())
- codes.append(int(self.prefix(2), 16))
- self.forward(2)
- try:
- value = bytes(codes).decode('utf-8')
- except UnicodeDecodeError as exc:
- raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark)
- return value
- def scan_line_break(self):
- # Transforms:
- # '\r\n' : '\n'
- # '\r' : '\n'
- # '\n' : '\n'
- # '\x85' : '\n'
- # '\u2028' : '\u2028'
- # '\u2029 : '\u2029'
- # default : ''
- ch = self.peek()
- if ch in '\r\n\x85':
- if self.prefix(2) == '\r\n':
- self.forward(2)
- else:
- self.forward()
- return '\n'
- elif ch in '\u2028\u2029':
- self.forward()
- return ch
- return ''
|