123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449 |
- # Scanner produces tokens of the following types:
- # STREAM-START
- # STREAM-END
- # DIRECTIVE(name, value)
- # DOCUMENT-START
- # DOCUMENT-END
- # BLOCK-SEQUENCE-START
- # BLOCK-MAPPING-START
- # BLOCK-END
- # FLOW-SEQUENCE-START
- # FLOW-MAPPING-START
- # FLOW-SEQUENCE-END
- # FLOW-MAPPING-END
- # BLOCK-ENTRY
- # FLOW-ENTRY
- # KEY
- # VALUE
- # ALIAS(value)
- # ANCHOR(value)
- # TAG(value)
- # SCALAR(value, plain, style)
- #
- # Read comments in the Scanner code for more details.
- #
- __all__ = ['Scanner', 'ScannerError']
- import sys
- from error import MarkedYAMLError
- from tokens import *
- class ScannerError(MarkedYAMLError):
- pass
- class SimpleKey(object):
- # See below simple keys treatment.
- def __init__(self, token_number, required, index, line, column, mark):
- self.token_number = token_number
- self.required = required
- self.index = index
- self.line = line
- self.column = column
- self.mark = mark
- class Scanner(object):
- def __init__(self):
- """Initialize the scanner."""
- # It is assumed that Scanner and Reader will have a common descendant.
- # Reader do the dirty work of checking for BOM and converting the
- # input data to Unicode. It also adds NUL to the end.
- #
- # Reader supports the following methods
- # self.peek(i=0) # peek the next i-th character
- # self.prefix(l=1) # peek the next l characters
- # self.forward(l=1) # read the next l characters and move the pointer.
- # Had we reached the end of the stream?
- self.done = False
- # The number of unclosed '{' and '['. `flow_level == 0` means block
- # context.
- self.flow_level = 0
- # List of processed tokens that are not yet emitted.
- self.tokens = []
- # Add the STREAM-START token.
- self.fetch_stream_start()
- # Number of tokens that were emitted through the `get_token` method.
- self.tokens_taken = 0
- # The current indentation level.
- self.indent = -1
- # Past indentation levels.
- self.indents = []
- # Variables related to simple keys treatment.
- # A simple key is a key that is not denoted by the '?' indicator.
- # Example of simple keys:
- # ---
- # block simple key: value
- # ? not a simple key:
- # : { flow simple key: value }
- # We emit the KEY token before all keys, so when we find a potential
- # simple key, we try to locate the corresponding ':' indicator.
- # Simple keys should be limited to a single line and 1024 characters.
- # Can a simple key start at the current position? A simple key may
- # start:
- # - at the beginning of the line, not counting indentation spaces
- # (in block context),
- # - after '{', '[', ',' (in the flow context),
- # - after '?', ':', '-' (in the block context).
- # In the block context, this flag also signifies if a block collection
- # may start at the current position.
- self.allow_simple_key = True
- # Keep track of possible simple keys. This is a dictionary. The key
- # is `flow_level`; there can be no more that one possible simple key
- # for each level. The value is a SimpleKey record:
- # (token_number, required, index, line, column, mark)
- # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
- # '[', or '{' tokens.
- self.possible_simple_keys = {}
- # Public methods.
- def check_token(self, *choices):
- # Check if the next token is one of the given types.
- while self.need_more_tokens():
- self.fetch_more_tokens()
- if self.tokens:
- if not choices:
- return True
- for choice in choices:
- if isinstance(self.tokens[0], choice):
- return True
- return False
- def peek_token(self):
- # Return the next token, but do not delete if from the queue.
- # Return None if no more tokens.
- while self.need_more_tokens():
- self.fetch_more_tokens()
- if self.tokens:
- return self.tokens[0]
- else:
- return None
- def get_token(self):
- # Return the next token.
- while self.need_more_tokens():
- self.fetch_more_tokens()
- if self.tokens:
- self.tokens_taken += 1
- return self.tokens.pop(0)
- # Private methods.
- def need_more_tokens(self):
- if self.done:
- return False
- if not self.tokens:
- return True
- # The current token may be a potential simple key, so we
- # need to look further.
- self.stale_possible_simple_keys()
- if self.next_possible_simple_key() == self.tokens_taken:
- return True
- def fetch_more_tokens(self):
- # Eat whitespaces and comments until we reach the next token.
- self.scan_to_next_token()
- # Remove obsolete possible simple keys.
- self.stale_possible_simple_keys()
- # Compare the current indentation and column. It may add some tokens
- # and decrease the current indentation level.
- self.unwind_indent(self.column)
- # Peek the next character.
- ch = self.peek()
- # Is it the end of stream?
- if ch == u'\0':
- return self.fetch_stream_end()
- # Is it a directive?
- if ch == u'%' and self.check_directive():
- return self.fetch_directive()
- # Is it the document start?
- if ch == u'-' and self.check_document_start():
- return self.fetch_document_start()
- # Is it the document end?
- if ch == u'.' and self.check_document_end():
- return self.fetch_document_end()
- # TODO: support for BOM within a stream.
- #if ch == u'\uFEFF':
- # return self.fetch_bom() <-- issue BOMToken
- # Note: the order of the following checks is NOT significant.
- # Is it the flow sequence start indicator?
- if ch == u'[':
- return self.fetch_flow_sequence_start()
- # Is it the flow mapping start indicator?
- if ch == u'{':
- return self.fetch_flow_mapping_start()
- # Is it the flow sequence end indicator?
- if ch == u']':
- return self.fetch_flow_sequence_end()
- # Is it the flow mapping end indicator?
- if ch == u'}':
- return self.fetch_flow_mapping_end()
- # Is it the flow entry indicator?
- if ch == u',':
- return self.fetch_flow_entry()
- # Is it the block entry indicator?
- if ch == u'-' and self.check_block_entry():
- return self.fetch_block_entry()
- # Is it the key indicator?
- if ch == u'?' and self.check_key():
- return self.fetch_key()
- # Is it the value indicator?
- if ch == u':' and self.check_value():
- return self.fetch_value()
- # Is it an alias?
- if ch == u'*':
- return self.fetch_alias()
- # Is it an anchor?
- if ch == u'&':
- return self.fetch_anchor()
- # Is it a tag?
- if ch == u'!':
- return self.fetch_tag()
- # Is it a literal scalar?
- if ch == u'|' and not self.flow_level:
- return self.fetch_literal()
- # Is it a folded scalar?
- if ch == u'>' and not self.flow_level:
- return self.fetch_folded()
- # Is it a single quoted scalar?
- if ch == u'\'':
- return self.fetch_single()
- # Is it a double quoted scalar?
- if ch == u'\"':
- return self.fetch_double()
- # It must be a plain scalar then.
- if self.check_plain():
- return self.fetch_plain()
- # No? It's an error. Let's produce a nice error message.
- raise ScannerError("while scanning for the next token", None,
- "found character %r that cannot start any token"
- % ch.encode('utf-8'), self.get_mark())
- # Simple keys treatment.
- def next_possible_simple_key(self):
- # Return the number of the nearest possible simple key. Actually we
- # don't need to loop through the whole dictionary. We may replace it
- # with the following code:
- # if not self.possible_simple_keys:
- # return None
- # return self.possible_simple_keys[
- # min(self.possible_simple_keys.keys())].token_number
- min_token_number = None
- for level in self.possible_simple_keys:
- key = self.possible_simple_keys[level]
- if min_token_number is None or key.token_number < min_token_number:
- min_token_number = key.token_number
- return min_token_number
- def stale_possible_simple_keys(self):
- # Remove entries that are no longer possible simple keys. According to
- # the YAML specification, simple keys
- # - should be limited to a single line,
- # - should be no longer than 1024 characters.
- # Disabling this procedure will allow simple keys of any length and
- # height (may cause problems if indentation is broken though).
- for level in self.possible_simple_keys.keys():
- key = self.possible_simple_keys[level]
- if key.line != self.line \
- or self.index-key.index > 1024:
- if key.required:
- raise ScannerError("while scanning a simple key", key.mark,
- "could not find expected ':'", self.get_mark())
- del self.possible_simple_keys[level]
- def save_possible_simple_key(self):
- # The next token may start a simple key. We check if it's possible
- # and save its position. This function is called for
- # ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
- # Check if a simple key is required at the current position.
- required = not self.flow_level and self.indent == self.column
- # The next token might be a simple key. Let's save it's number and
- # position.
- if self.allow_simple_key:
- self.remove_possible_simple_key()
- token_number = self.tokens_taken+len(self.tokens)
- key = SimpleKey(token_number, required,
- self.index, self.line, self.column, self.get_mark())
- self.possible_simple_keys[self.flow_level] = key
- def remove_possible_simple_key(self):
- # Remove the saved possible key position at the current flow level.
- if self.flow_level in self.possible_simple_keys:
- key = self.possible_simple_keys[self.flow_level]
-
- if key.required:
- raise ScannerError("while scanning a simple key", key.mark,
- "could not find expected ':'", self.get_mark())
- del self.possible_simple_keys[self.flow_level]
- # Indentation functions.
- def unwind_indent(self, column):
- ## In flow context, tokens should respect indentation.
- ## Actually the condition should be `self.indent >= column` according to
- ## the spec. But this condition will prohibit intuitively correct
- ## constructions such as
- ## key : {
- ## }
- #if self.flow_level and self.indent > column:
- # raise ScannerError(None, None,
- # "invalid indentation or unclosed '[' or '{'",
- # self.get_mark())
- # In the flow context, indentation is ignored. We make the scanner less
- # restrictive then specification requires.
- if self.flow_level:
- return
- # In block context, we may need to issue the BLOCK-END tokens.
- while self.indent > column:
- mark = self.get_mark()
- self.indent = self.indents.pop()
- self.tokens.append(BlockEndToken(mark, mark))
- def add_indent(self, column):
- # Check if we need to increase indentation.
- if self.indent < column:
- self.indents.append(self.indent)
- self.indent = column
- return True
- return False
- # Fetchers.
- def fetch_stream_start(self):
- # We always add STREAM-START as the first token and STREAM-END as the
- # last token.
- # Read the token.
- mark = self.get_mark()
-
- # Add STREAM-START.
- self.tokens.append(StreamStartToken(mark, mark,
- encoding=self.encoding))
-
- def fetch_stream_end(self):
- # Set the current indentation to -1.
- self.unwind_indent(-1)
- # Reset simple keys.
- self.remove_possible_simple_key()
- self.allow_simple_key = False
- self.possible_simple_keys = {}
- # Read the token.
- mark = self.get_mark()
-
- # Add STREAM-END.
- self.tokens.append(StreamEndToken(mark, mark))
- # The steam is finished.
- self.done = True
- def fetch_directive(self):
-
- # Set the current indentation to -1.
- self.unwind_indent(-1)
- # Reset simple keys.
- self.remove_possible_simple_key()
- self.allow_simple_key = False
- # Scan and add DIRECTIVE.
- self.tokens.append(self.scan_directive())
- def fetch_document_start(self):
- self.fetch_document_indicator(DocumentStartToken)
- def fetch_document_end(self):
- self.fetch_document_indicator(DocumentEndToken)
- def fetch_document_indicator(self, TokenClass):
- # Set the current indentation to -1.
- self.unwind_indent(-1)
- # Reset simple keys. Note that there could not be a block collection
- # after '---'.
- self.remove_possible_simple_key()
- self.allow_simple_key = False
- # Add DOCUMENT-START or DOCUMENT-END.
- start_mark = self.get_mark()
- self.forward(3)
- end_mark = self.get_mark()
- self.tokens.append(TokenClass(start_mark, end_mark))
- def fetch_flow_sequence_start(self):
- self.fetch_flow_collection_start(FlowSequenceStartToken)
- def fetch_flow_mapping_start(self):
- self.fetch_flow_collection_start(FlowMappingStartToken)
- def fetch_flow_collection_start(self, TokenClass):
- # '[' and '{' may start a simple key.
- self.save_possible_simple_key()
- # Increase the flow level.
- self.flow_level += 1
- # Simple keys are allowed after '[' and '{'.
- self.allow_simple_key = True
- # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
- start_mark = self.get_mark()
- self.forward()
- end_mark = self.get_mark()
- self.tokens.append(TokenClass(start_mark, end_mark))
- def fetch_flow_sequence_end(self):
- self.fetch_flow_collection_end(FlowSequenceEndToken)
- def fetch_flow_mapping_end(self):
- self.fetch_flow_collection_end(FlowMappingEndToken)
- def fetch_flow_collection_end(self, TokenClass):
- # Reset possible simple key on the current level.
- self.remove_possible_simple_key()
- # Decrease the flow level.
- self.flow_level -= 1
- # No simple keys after ']' or '}'.
- self.allow_simple_key = False
- # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
- start_mark = self.get_mark()
- self.forward()
- end_mark = self.get_mark()
- self.tokens.append(TokenClass(start_mark, end_mark))
- def fetch_flow_entry(self):
- # Simple keys are allowed after ','.
- self.allow_simple_key = True
- # Reset possible simple key on the current level.
- self.remove_possible_simple_key()
- # Add FLOW-ENTRY.
- start_mark = self.get_mark()
- self.forward()
- end_mark = self.get_mark()
- self.tokens.append(FlowEntryToken(start_mark, end_mark))
- def fetch_block_entry(self):
- # Block context needs additional checks.
- if not self.flow_level:
- # Are we allowed to start a new entry?
- if not self.allow_simple_key:
- raise ScannerError(None, None,
- "sequence entries are not allowed here",
- self.get_mark())
- # We may need to add BLOCK-SEQUENCE-START.
- if self.add_indent(self.column):
- mark = self.get_mark()
- self.tokens.append(BlockSequenceStartToken(mark, mark))
- # It's an error for the block entry to occur in the flow context,
- # but we let the parser detect this.
- else:
- pass
- # Simple keys are allowed after '-'.
- self.allow_simple_key = True
- # Reset possible simple key on the current level.
- self.remove_possible_simple_key()
- # Add BLOCK-ENTRY.
- start_mark = self.get_mark()
- self.forward()
- end_mark = self.get_mark()
- self.tokens.append(BlockEntryToken(start_mark, end_mark))
- def fetch_key(self):
-
- # Block context needs additional checks.
- if not self.flow_level:
- # Are we allowed to start a key (not necessary a simple)?
- if not self.allow_simple_key:
- raise ScannerError(None, None,
- "mapping keys are not allowed here",
- self.get_mark())
- # We may need to add BLOCK-MAPPING-START.
- if self.add_indent(self.column):
- mark = self.get_mark()
- self.tokens.append(BlockMappingStartToken(mark, mark))
- # Simple keys are allowed after '?' in the block context.
- self.allow_simple_key = not self.flow_level
- # Reset possible simple key on the current level.
- self.remove_possible_simple_key()
- # Add KEY.
- start_mark = self.get_mark()
- self.forward()
- end_mark = self.get_mark()
- self.tokens.append(KeyToken(start_mark, end_mark))
- def fetch_value(self):
- # Do we determine a simple key?
- if self.flow_level in self.possible_simple_keys:
- # Add KEY.
- key = self.possible_simple_keys[self.flow_level]
- del self.possible_simple_keys[self.flow_level]
- self.tokens.insert(key.token_number-self.tokens_taken,
- KeyToken(key.mark, key.mark))
- # If this key starts a new block mapping, we need to add
- # BLOCK-MAPPING-START.
- if not self.flow_level:
- if self.add_indent(key.column):
- self.tokens.insert(key.token_number-self.tokens_taken,
- BlockMappingStartToken(key.mark, key.mark))
- # There cannot be two simple keys one after another.
- self.allow_simple_key = False
- # It must be a part of a complex key.
- else:
-
- # Block context needs additional checks.
- # (Do we really need them? They will be caught by the parser
- # anyway.)
- if not self.flow_level:
- # We are allowed to start a complex value if and only if
- # we can start a simple key.
- if not self.allow_simple_key:
- raise ScannerError(None, None,
- "mapping values are not allowed here",
- self.get_mark())
- # If this value starts a new block mapping, we need to add
- # BLOCK-MAPPING-START. It will be detected as an error later by
- # the parser.
- if not self.flow_level:
- if self.add_indent(self.column):
- mark = self.get_mark()
- self.tokens.append(BlockMappingStartToken(mark, mark))
- # Simple keys are allowed after ':' in the block context.
- self.allow_simple_key = not self.flow_level
- # Reset possible simple key on the current level.
- self.remove_possible_simple_key()
- # Add VALUE.
- start_mark = self.get_mark()
- self.forward()
- end_mark = self.get_mark()
- self.tokens.append(ValueToken(start_mark, end_mark))
- def fetch_alias(self):
- # ALIAS could be a simple key.
- self.save_possible_simple_key()
- # No simple keys after ALIAS.
- self.allow_simple_key = False
- # Scan and add ALIAS.
- self.tokens.append(self.scan_anchor(AliasToken))
- def fetch_anchor(self):
- # ANCHOR could start a simple key.
- self.save_possible_simple_key()
- # No simple keys after ANCHOR.
- self.allow_simple_key = False
- # Scan and add ANCHOR.
- self.tokens.append(self.scan_anchor(AnchorToken))
- def fetch_tag(self):
- # TAG could start a simple key.
- self.save_possible_simple_key()
- # No simple keys after TAG.
- self.allow_simple_key = False
- # Scan and add TAG.
- self.tokens.append(self.scan_tag())
- def fetch_literal(self):
- self.fetch_block_scalar(style='|')
- def fetch_folded(self):
- self.fetch_block_scalar(style='>')
- def fetch_block_scalar(self, style):
- # A simple key may follow a block scalar.
- self.allow_simple_key = True
- # Reset possible simple key on the current level.
- self.remove_possible_simple_key()
- # Scan and add SCALAR.
- self.tokens.append(self.scan_block_scalar(style))
- def fetch_single(self):
- self.fetch_flow_scalar(style='\'')
- def fetch_double(self):
- self.fetch_flow_scalar(style='"')
- def fetch_flow_scalar(self, style):
- # A flow scalar could be a simple key.
- self.save_possible_simple_key()
- # No simple keys after flow scalars.
- self.allow_simple_key = False
- # Scan and add SCALAR.
- self.tokens.append(self.scan_flow_scalar(style))
- def fetch_plain(self):
- # A plain scalar could be a simple key.
- self.save_possible_simple_key()
- # No simple keys after plain scalars. But note that `scan_plain` will
- # change this flag if the scan is finished at the beginning of the
- # line.
- self.allow_simple_key = False
- # Scan and add SCALAR. May change `allow_simple_key`.
- self.tokens.append(self.scan_plain())
- # Checkers.
- def check_directive(self):
- # DIRECTIVE: ^ '%' ...
- # The '%' indicator is already checked.
- if self.column == 0:
- return True
- def check_document_start(self):
- # DOCUMENT-START: ^ '---' (' '|'\n')
- if self.column == 0:
- if self.prefix(3) == u'---' \
- and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
- return True
- def check_document_end(self):
- # DOCUMENT-END: ^ '...' (' '|'\n')
- if self.column == 0:
- if self.prefix(3) == u'...' \
- and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
- return True
- def check_block_entry(self):
- # BLOCK-ENTRY: '-' (' '|'\n')
- return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
- def check_key(self):
- # KEY(flow context): '?'
- if self.flow_level:
- return True
- # KEY(block context): '?' (' '|'\n')
- else:
- return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
- def check_value(self):
- # VALUE(flow context): ':'
- if self.flow_level:
- return True
- # VALUE(block context): ':' (' '|'\n')
- else:
- return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
- def check_plain(self):
- # A plain scalar may start with any non-space character except:
- # '-', '?', ':', ',', '[', ']', '{', '}',
- # '#', '&', '*', '!', '|', '>', '\'', '\"',
- # '%', '@', '`'.
- #
- # It may also start with
- # '-', '?', ':'
- # if it is followed by a non-space character.
- #
- # Note that we limit the last rule to the block context (except the
- # '-' character) because we want the flow context to be space
- # independent.
- ch = self.peek()
- return ch not in u'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`' \
- or (self.peek(1) not in u'\0 \t\r\n\x85\u2028\u2029'
- and (ch == u'-' or (not self.flow_level and ch in u'?:')))
- # Scanners.
- def scan_to_next_token(self):
- # We ignore spaces, line breaks and comments.
- # If we find a line break in the block context, we set the flag
- # `allow_simple_key` on.
- # The byte order mark is stripped if it's the first character in the
- # stream. We do not yet support BOM inside the stream as the
- # specification requires. Any such mark will be considered as a part
- # of the document.
- #
- # TODO: We need to make tab handling rules more sane. A good rule is
- # Tabs cannot precede tokens
- # BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
- # KEY(block), VALUE(block), BLOCK-ENTRY
- # So the checking code is
- # if <TAB>:
- # self.allow_simple_keys = False
- # We also need to add the check for `allow_simple_keys == True` to
- # `unwind_indent` before issuing BLOCK-END.
- # Scanners for block, flow, and plain scalars need to be modified.
- if self.index == 0 and self.peek() == u'\uFEFF':
- self.forward()
- found = False
- while not found:
- while self.peek() == u' ':
- self.forward()
- if self.peek() == u'#':
- while self.peek() not in u'\0\r\n\x85\u2028\u2029':
- self.forward()
- if self.scan_line_break():
- if not self.flow_level:
- self.allow_simple_key = True
- else:
- found = True
- def scan_directive(self):
- # See the specification for details.
- start_mark = self.get_mark()
- self.forward()
- name = self.scan_directive_name(start_mark)
- value = None
- if name == u'YAML':
- value = self.scan_yaml_directive_value(start_mark)
- end_mark = self.get_mark()
- elif name == u'TAG':
- value = self.scan_tag_directive_value(start_mark)
- end_mark = self.get_mark()
- else:
- end_mark = self.get_mark()
- while self.peek() not in u'\0\r\n\x85\u2028\u2029':
- self.forward()
- self.scan_directive_ignored_line(start_mark)
- return DirectiveToken(name, value, start_mark, end_mark)
- def scan_directive_name(self, start_mark):
- # See the specification for details.
- length = 0
- ch = self.peek(length)
- while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \
- or ch in u'-_':
- length += 1
- ch = self.peek(length)
- if not length:
- raise ScannerError("while scanning a directive", start_mark,
- "expected alphabetic or numeric character, but found %r"
- % ch.encode('utf-8'), self.get_mark())
- value = self.prefix(length)
- self.forward(length)
- ch = self.peek()
- if ch not in u'\0 \r\n\x85\u2028\u2029':
- raise ScannerError("while scanning a directive", start_mark,
- "expected alphabetic or numeric character, but found %r"
- % ch.encode('utf-8'), self.get_mark())
- return value
- def scan_yaml_directive_value(self, start_mark):
- # See the specification for details.
- while self.peek() == u' ':
- self.forward()
- major = self.scan_yaml_directive_number(start_mark)
- if self.peek() != '.':
- raise ScannerError("while scanning a directive", start_mark,
- "expected a digit or '.', but found %r"
- % self.peek().encode('utf-8'),
- self.get_mark())
- self.forward()
- minor = self.scan_yaml_directive_number(start_mark)
- if self.peek() not in u'\0 \r\n\x85\u2028\u2029':
- raise ScannerError("while scanning a directive", start_mark,
- "expected a digit or ' ', but found %r"
- % self.peek().encode('utf-8'),
- self.get_mark())
- return (major, minor)
- def scan_yaml_directive_number(self, start_mark):
- # See the specification for details.
- ch = self.peek()
- if not (u'0' <= ch <= u'9'):
- raise ScannerError("while scanning a directive", start_mark,
- "expected a digit, but found %r" % ch.encode('utf-8'),
- self.get_mark())
- length = 0
- while u'0' <= self.peek(length) <= u'9':
- length += 1
- value = int(self.prefix(length))
- self.forward(length)
- return value
- def scan_tag_directive_value(self, start_mark):
- # See the specification for details.
- while self.peek() == u' ':
- self.forward()
- handle = self.scan_tag_directive_handle(start_mark)
- while self.peek() == u' ':
- self.forward()
- prefix = self.scan_tag_directive_prefix(start_mark)
- return (handle, prefix)
- def scan_tag_directive_handle(self, start_mark):
- # See the specification for details.
- value = self.scan_tag_handle('directive', start_mark)
- ch = self.peek()
- if ch != u' ':
- raise ScannerError("while scanning a directive", start_mark,
- "expected ' ', but found %r" % ch.encode('utf-8'),
- self.get_mark())
- return value
- def scan_tag_directive_prefix(self, start_mark):
- # See the specification for details.
- value = self.scan_tag_uri('directive', start_mark)
- ch = self.peek()
- if ch not in u'\0 \r\n\x85\u2028\u2029':
- raise ScannerError("while scanning a directive", start_mark,
- "expected ' ', but found %r" % ch.encode('utf-8'),
- self.get_mark())
- return value
- def scan_directive_ignored_line(self, start_mark):
- # See the specification for details.
- while self.peek() == u' ':
- self.forward()
- if self.peek() == u'#':
- while self.peek() not in u'\0\r\n\x85\u2028\u2029':
- self.forward()
- ch = self.peek()
- if ch not in u'\0\r\n\x85\u2028\u2029':
- raise ScannerError("while scanning a directive", start_mark,
- "expected a comment or a line break, but found %r"
- % ch.encode('utf-8'), self.get_mark())
- self.scan_line_break()
- def scan_anchor(self, TokenClass):
- # The specification does not restrict characters for anchors and
- # aliases. This may lead to problems, for instance, the document:
- # [ *alias, value ]
- # can be interpreted in two ways, as
- # [ "value" ]
- # and
- # [ *alias , "value" ]
- # Therefore we restrict aliases to numbers and ASCII letters.
- start_mark = self.get_mark()
- indicator = self.peek()
- if indicator == u'*':
- name = 'alias'
- else:
- name = 'anchor'
- self.forward()
- length = 0
- ch = self.peek(length)
- while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \
- or ch in u'-_':
- length += 1
- ch = self.peek(length)
- if not length:
- raise ScannerError("while scanning an %s" % name, start_mark,
- "expected alphabetic or numeric character, but found %r"
- % ch.encode('utf-8'), self.get_mark())
- value = self.prefix(length)
- self.forward(length)
- ch = self.peek()
- if ch not in u'\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
- raise ScannerError("while scanning an %s" % name, start_mark,
- "expected alphabetic or numeric character, but found %r"
- % ch.encode('utf-8'), self.get_mark())
- end_mark = self.get_mark()
- return TokenClass(value, start_mark, end_mark)
- def scan_tag(self):
- # See the specification for details.
- start_mark = self.get_mark()
- ch = self.peek(1)
- if ch == u'<':
- handle = None
- self.forward(2)
- suffix = self.scan_tag_uri('tag', start_mark)
- if self.peek() != u'>':
- raise ScannerError("while parsing a tag", start_mark,
- "expected '>', but found %r" % self.peek().encode('utf-8'),
- self.get_mark())
- self.forward()
- elif ch in u'\0 \t\r\n\x85\u2028\u2029':
- handle = None
- suffix = u'!'
- self.forward()
- else:
- length = 1
- use_handle = False
- while ch not in u'\0 \r\n\x85\u2028\u2029':
- if ch == u'!':
- use_handle = True
- break
- length += 1
- ch = self.peek(length)
- handle = u'!'
- if use_handle:
- handle = self.scan_tag_handle('tag', start_mark)
- else:
- handle = u'!'
- self.forward()
- suffix = self.scan_tag_uri('tag', start_mark)
- ch = self.peek()
- if ch not in u'\0 \r\n\x85\u2028\u2029':
- raise ScannerError("while scanning a tag", start_mark,
- "expected ' ', but found %r" % ch.encode('utf-8'),
- self.get_mark())
- value = (handle, suffix)
- end_mark = self.get_mark()
- return TagToken(value, start_mark, end_mark)
- def scan_block_scalar(self, style):
- # See the specification for details.
- if style == '>':
- folded = True
- else:
- folded = False
- chunks = []
- start_mark = self.get_mark()
- # Scan the header.
- self.forward()
- chomping, increment = self.scan_block_scalar_indicators(start_mark)
- self.scan_block_scalar_ignored_line(start_mark)
- # Determine the indentation level and go to the first non-empty line.
- min_indent = self.indent+1
- if min_indent < 1:
- min_indent = 1
- if increment is None:
- breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
- indent = max(min_indent, max_indent)
- else:
- indent = min_indent+increment-1
- breaks, end_mark = self.scan_block_scalar_breaks(indent)
- line_break = u''
- # Scan the inner part of the block scalar.
- while self.column == indent and self.peek() != u'\0':
- chunks.extend(breaks)
- leading_non_space = self.peek() not in u' \t'
- length = 0
- while self.peek(length) not in u'\0\r\n\x85\u2028\u2029':
- length += 1
- chunks.append(self.prefix(length))
- self.forward(length)
- line_break = self.scan_line_break()
- breaks, end_mark = self.scan_block_scalar_breaks(indent)
- if self.column == indent and self.peek() != u'\0':
- # Unfortunately, folding rules are ambiguous.
- #
- # This is the folding according to the specification:
-
- if folded and line_break == u'\n' \
- and leading_non_space and self.peek() not in u' \t':
- if not breaks:
- chunks.append(u' ')
- else:
- chunks.append(line_break)
-
- # This is Clark Evans's interpretation (also in the spec
- # examples):
- #
- #if folded and line_break == u'\n':
- # if not breaks:
- # if self.peek() not in ' \t':
- # chunks.append(u' ')
- # else:
- # chunks.append(line_break)
- #else:
- # chunks.append(line_break)
- else:
- break
- # Chomp the tail.
- if chomping is not False:
- chunks.append(line_break)
- if chomping is True:
- chunks.extend(breaks)
- # We are done.
- return ScalarToken(u''.join(chunks), False, start_mark, end_mark,
- style)
- def scan_block_scalar_indicators(self, start_mark):
- # See the specification for details.
- chomping = None
- increment = None
- ch = self.peek()
- if ch in u'+-':
- if ch == '+':
- chomping = True
- else:
- chomping = False
- self.forward()
- ch = self.peek()
- if ch in u'0123456789':
- increment = int(ch)
- if increment == 0:
- raise ScannerError("while scanning a block scalar", start_mark,
- "expected indentation indicator in the range 1-9, but found 0",
- self.get_mark())
- self.forward()
- elif ch in u'0123456789':
- increment = int(ch)
- if increment == 0:
- raise ScannerError("while scanning a block scalar", start_mark,
- "expected indentation indicator in the range 1-9, but found 0",
- self.get_mark())
- self.forward()
- ch = self.peek()
- if ch in u'+-':
- if ch == '+':
- chomping = True
- else:
- chomping = False
- self.forward()
- ch = self.peek()
- if ch not in u'\0 \r\n\x85\u2028\u2029':
- raise ScannerError("while scanning a block scalar", start_mark,
- "expected chomping or indentation indicators, but found %r"
- % ch.encode('utf-8'), self.get_mark())
- return chomping, increment
- def scan_block_scalar_ignored_line(self, start_mark):
- # See the specification for details.
- while self.peek() == u' ':
- self.forward()
- if self.peek() == u'#':
- while self.peek() not in u'\0\r\n\x85\u2028\u2029':
- self.forward()
- ch = self.peek()
- if ch not in u'\0\r\n\x85\u2028\u2029':
- raise ScannerError("while scanning a block scalar", start_mark,
- "expected a comment or a line break, but found %r"
- % ch.encode('utf-8'), self.get_mark())
- self.scan_line_break()
- def scan_block_scalar_indentation(self):
- # See the specification for details.
- chunks = []
- max_indent = 0
- end_mark = self.get_mark()
- while self.peek() in u' \r\n\x85\u2028\u2029':
- if self.peek() != u' ':
- chunks.append(self.scan_line_break())
- end_mark = self.get_mark()
- else:
- self.forward()
- if self.column > max_indent:
- max_indent = self.column
- return chunks, max_indent, end_mark
- def scan_block_scalar_breaks(self, indent):
- # See the specification for details.
- chunks = []
- end_mark = self.get_mark()
- while self.column < indent and self.peek() == u' ':
- self.forward()
- while self.peek() in u'\r\n\x85\u2028\u2029':
- chunks.append(self.scan_line_break())
- end_mark = self.get_mark()
- while self.column < indent and self.peek() == u' ':
- self.forward()
- return chunks, end_mark
- def scan_flow_scalar(self, style):
- # See the specification for details.
- # Note that we loose indentation rules for quoted scalars. Quoted
- # scalars don't need to adhere indentation because " and ' clearly
- # mark the beginning and the end of them. Therefore we are less
- # restrictive then the specification requires. We only need to check
- # that document separators are not included in scalars.
- if style == '"':
- double = True
- else:
- double = False
- chunks = []
- start_mark = self.get_mark()
- quote = self.peek()
- self.forward()
- chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
- while self.peek() != quote:
- chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
- chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
- self.forward()
- end_mark = self.get_mark()
- return ScalarToken(u''.join(chunks), False, start_mark, end_mark,
- style)
- ESCAPE_REPLACEMENTS = {
- u'0': u'\0',
- u'a': u'\x07',
- u'b': u'\x08',
- u't': u'\x09',
- u'\t': u'\x09',
- u'n': u'\x0A',
- u'v': u'\x0B',
- u'f': u'\x0C',
- u'r': u'\x0D',
- u'e': u'\x1B',
- u' ': u'\x20',
- u'\"': u'\"',
- u'\\': u'\\',
- u'/': u'/',
- u'N': u'\x85',
- u'_': u'\xA0',
- u'L': u'\u2028',
- u'P': u'\u2029',
- }
- ESCAPE_CODES = {
- u'x': 2,
- u'u': 4,
- u'U': 8,
- }
- def scan_flow_scalar_non_spaces(self, double, start_mark):
- # See the specification for details.
- chunks = []
- while True:
- length = 0
- while self.peek(length) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
- length += 1
- if length:
- chunks.append(self.prefix(length))
- self.forward(length)
- ch = self.peek()
- if not double and ch == u'\'' and self.peek(1) == u'\'':
- chunks.append(u'\'')
- self.forward(2)
- elif (double and ch == u'\'') or (not double and ch in u'\"\\'):
- chunks.append(ch)
- self.forward()
- elif double and ch == u'\\':
- self.forward()
- ch = self.peek()
- if ch in self.ESCAPE_REPLACEMENTS:
- chunks.append(self.ESCAPE_REPLACEMENTS[ch])
- self.forward()
- elif ch in self.ESCAPE_CODES:
- length = self.ESCAPE_CODES[ch]
- self.forward()
- for k in range(length):
- if self.peek(k) not in u'0123456789ABCDEFabcdef':
- raise ScannerError("while scanning a double-quoted scalar", start_mark,
- "expected escape sequence of %d hexdecimal numbers, but found %r" %
- (length, self.peek(k).encode('utf-8')), self.get_mark())
- code = int(self.prefix(length), 16)
- if code <= sys.maxunicode:
- chunks.append(unichr(code))
- else:
- chunks.append(('\\U%08x' % code).decode('unicode-escape'))
- self.forward(length)
- elif ch in u'\r\n\x85\u2028\u2029':
- self.scan_line_break()
- chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
- else:
- raise ScannerError("while scanning a double-quoted scalar", start_mark,
- "found unknown escape character %r" % ch.encode('utf-8'), self.get_mark())
- else:
- return chunks
- def scan_flow_scalar_spaces(self, double, start_mark):
- # See the specification for details.
- chunks = []
- length = 0
- while self.peek(length) in u' \t':
- length += 1
- whitespaces = self.prefix(length)
- self.forward(length)
- ch = self.peek()
- if ch == u'\0':
- raise ScannerError("while scanning a quoted scalar", start_mark,
- "found unexpected end of stream", self.get_mark())
- elif ch in u'\r\n\x85\u2028\u2029':
- line_break = self.scan_line_break()
- breaks = self.scan_flow_scalar_breaks(double, start_mark)
- if line_break != u'\n':
- chunks.append(line_break)
- elif not breaks:
- chunks.append(u' ')
- chunks.extend(breaks)
- else:
- chunks.append(whitespaces)
- return chunks
- def scan_flow_scalar_breaks(self, double, start_mark):
- # See the specification for details.
- chunks = []
- while True:
- # Instead of checking indentation, we check for document
- # separators.
- prefix = self.prefix(3)
- if (prefix == u'---' or prefix == u'...') \
- and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
- raise ScannerError("while scanning a quoted scalar", start_mark,
- "found unexpected document separator", self.get_mark())
- while self.peek() in u' \t':
- self.forward()
- if self.peek() in u'\r\n\x85\u2028\u2029':
- chunks.append(self.scan_line_break())
- else:
- return chunks
- def scan_plain(self):
- # See the specification for details.
- # We add an additional restriction for the flow context:
- # plain scalars in the flow context cannot contain ',' or '?'.
- # We also keep track of the `allow_simple_key` flag here.
- # Indentation rules are loosed for the flow context.
- chunks = []
- start_mark = self.get_mark()
- end_mark = start_mark
- indent = self.indent+1
- # We allow zero indentation for scalars, but then we need to check for
- # document separators at the beginning of the line.
- #if indent == 0:
- # indent = 1
- spaces = []
- while True:
- length = 0
- if self.peek() == u'#':
- break
- while True:
- ch = self.peek(length)
- if ch in u'\0 \t\r\n\x85\u2028\u2029' \
- or (ch == u':' and
- self.peek(length+1) in u'\0 \t\r\n\x85\u2028\u2029'
- + (u',[]{}' if self.flow_level else u''))\
- or (self.flow_level and ch in u',?[]{}'):
- break
- length += 1
- if length == 0:
- break
- self.allow_simple_key = False
- chunks.extend(spaces)
- chunks.append(self.prefix(length))
- self.forward(length)
- end_mark = self.get_mark()
- spaces = self.scan_plain_spaces(indent, start_mark)
- if not spaces or self.peek() == u'#' \
- or (not self.flow_level and self.column < indent):
- break
- return ScalarToken(u''.join(chunks), True, start_mark, end_mark)
- def scan_plain_spaces(self, indent, start_mark):
- # See the specification for details.
- # The specification is really confusing about tabs in plain scalars.
- # We just forbid them completely. Do not use tabs in YAML!
- chunks = []
- length = 0
- while self.peek(length) in u' ':
- length += 1
- whitespaces = self.prefix(length)
- self.forward(length)
- ch = self.peek()
- if ch in u'\r\n\x85\u2028\u2029':
- line_break = self.scan_line_break()
- self.allow_simple_key = True
- prefix = self.prefix(3)
- if (prefix == u'---' or prefix == u'...') \
- and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
- return
- breaks = []
- while self.peek() in u' \r\n\x85\u2028\u2029':
- if self.peek() == ' ':
- self.forward()
- else:
- breaks.append(self.scan_line_break())
- prefix = self.prefix(3)
- if (prefix == u'---' or prefix == u'...') \
- and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
- return
- if line_break != u'\n':
- chunks.append(line_break)
- elif not breaks:
- chunks.append(u' ')
- chunks.extend(breaks)
- elif whitespaces:
- chunks.append(whitespaces)
- return chunks
- def scan_tag_handle(self, name, start_mark):
- # See the specification for details.
- # For some strange reasons, the specification does not allow '_' in
- # tag handles. I have allowed it anyway.
- ch = self.peek()
- if ch != u'!':
- raise ScannerError("while scanning a %s" % name, start_mark,
- "expected '!', but found %r" % ch.encode('utf-8'),
- self.get_mark())
- length = 1
- ch = self.peek(length)
- if ch != u' ':
- while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \
- or ch in u'-_':
- length += 1
- ch = self.peek(length)
- if ch != u'!':
- self.forward(length)
- raise ScannerError("while scanning a %s" % name, start_mark,
- "expected '!', but found %r" % ch.encode('utf-8'),
- self.get_mark())
- length += 1
- value = self.prefix(length)
- self.forward(length)
- return value
- def scan_tag_uri(self, name, start_mark):
- # See the specification for details.
- # Note: we do not check if URI is well-formed.
- chunks = []
- length = 0
- ch = self.peek(length)
- while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \
- or ch in u'-;/?:@&=+$,_.!~*\'()[]%':
- if ch == u'%':
- chunks.append(self.prefix(length))
- self.forward(length)
- length = 0
- chunks.append(self.scan_uri_escapes(name, start_mark))
- else:
- length += 1
- ch = self.peek(length)
- if length:
- chunks.append(self.prefix(length))
- self.forward(length)
- length = 0
- if not chunks:
- raise ScannerError("while parsing a %s" % name, start_mark,
- "expected URI, but found %r" % ch.encode('utf-8'),
- self.get_mark())
- return u''.join(chunks)
- def scan_uri_escapes(self, name, start_mark):
- # See the specification for details.
- bytes = []
- mark = self.get_mark()
- while self.peek() == u'%':
- self.forward()
- for k in range(2):
- if self.peek(k) not in u'0123456789ABCDEFabcdef':
- raise ScannerError("while scanning a %s" % name, start_mark,
- "expected URI escape sequence of 2 hexdecimal numbers, but found %r" %
- (self.peek(k).encode('utf-8')), self.get_mark())
- bytes.append(chr(int(self.prefix(2), 16)))
- self.forward(2)
- try:
- value = unicode(''.join(bytes), 'utf-8')
- except UnicodeDecodeError, exc:
- raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark)
- return value
- def scan_line_break(self):
- # Transforms:
- # '\r\n' : '\n'
- # '\r' : '\n'
- # '\n' : '\n'
- # '\x85' : '\n'
- # '\u2028' : '\u2028'
- # '\u2029 : '\u2029'
- # default : ''
- ch = self.peek()
- if ch in u'\r\n\x85':
- if self.prefix(2) == u'\r\n':
- self.forward(2)
- else:
- self.forward()
- return u'\n'
- elif ch in u'\u2028\u2029':
- self.forward()
- return ch
- return u''
|