| # Scanner produces tokens of the following types: |
| # STREAM-START |
| # STREAM-END |
| # DIRECTIVE(name, value) |
| # DOCUMENT-START |
| # DOCUMENT-END |
| # BLOCK-SEQUENCE-START |
| # BLOCK-MAPPING-START |
| # BLOCK-END |
| # FLOW-SEQUENCE-START |
| # FLOW-MAPPING-START |
| # FLOW-SEQUENCE-END |
| # FLOW-MAPPING-END |
| # BLOCK-ENTRY |
| # FLOW-ENTRY |
| # KEY |
| # VALUE |
| # ALIAS(value) |
| # ANCHOR(value) |
| # TAG(value) |
| # SCALAR(value, plain, style) |
| # |
| # Read comments in the Scanner code for more details. |
| # |
| |
| __all__ = ["Scanner", "ScannerError"] |
| |
| from .error import MarkedYAMLError |
| from .tokens import * |
| |
| |
class ScannerError(MarkedYAMLError):
    """Error raised by the scanner when the input cannot be split into tokens."""
| |
| |
class SimpleKey:
    """Record of a position at which a simple key may start.

    The scanner keeps one such record per flow level; see the simple-key
    handling in ``Scanner`` for how the fields are used.
    """

    def __init__(self, token_number, required, index, line, column, mark):
        # Number of the token the key would be inserted in front of, and
        # whether a key is mandatory at this position.
        self.token_number, self.required = token_number, required
        # Where the candidate key starts in the input.
        self.index, self.line, self.column = index, line, column
        # Mark used for the KEY token and for error messages.
        self.mark = mark
| |
| |
| class Scanner: |
def __init__(self):
    """Initialize the scanner."""
    # Scanner and Reader are assumed to share a common descendant class.
    # Reader does the dirty work of checking for a BOM and converting the
    # input data to Unicode; it also appends NUL to the end of the input.
    #
    # The scanner relies on these Reader methods:
    #   self.peek(i=0)     -- peek the next i-th character
    #   self.prefix(l=1)   -- peek the next l characters
    #   self.forward(l=1)  -- read the next l characters and move the pointer

    # True once the end of the stream has been reached.
    self.done = False

    # Number of unclosed '{' and '['; zero means block context.
    self.flow_level = 0

    # Tokens that have been processed but not yet emitted.
    self.tokens = []

    # Queue the STREAM-START token right away.
    self.fetch_stream_start()

    # How many tokens have been handed out through `get_token`.
    self.tokens_taken = 0

    # Past indentation levels and the current one.
    self.indents = []
    self.indent = -1

    # --- Simple key handling -------------------------------------------
    #
    # A simple key is a key that is not introduced by the '?' indicator:
    #   ---
    #   block simple key: value
    #   ? not a simple key:
    #   : { flow simple key: value }
    # A KEY token is emitted before every key, so whenever a potential
    # simple key is seen the scanner tries to locate the matching ':'.
    # Simple keys are limited to a single line and 1024 characters.

    # Whether a simple key may start at the current position. It may start
    #   - at the beginning of a line, not counting indentation spaces
    #     (block context),
    #   - after '{', '[', ',' (flow context),
    #   - after '?', ':', '-' (block context).
    # In block context the flag also tells whether a block collection may
    # start at the current position.
    self.allow_simple_key = True

    # Possible simple keys, indexed by `flow_level`; there can be no more
    # than one candidate per level. Each value is a SimpleKey record:
    #   (token_number, required, index, line, column, mark)
    # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
    # '[' or '{' tokens.
    self.possible_simple_keys = {}
| |
| # Public methods. |
| |
def check_token(self, *choices):
    """Tell whether the next token is an instance of one of ``choices``.

    More tokens are fetched first if the scanner needs them. With no
    ``choices`` the result only says whether a token is available.
    """
    while self.need_more_tokens():
        self.fetch_more_tokens()
    if not self.tokens:
        return False
    # `choices` is already a tuple, which isinstance() accepts directly.
    return not choices or isinstance(self.tokens[0], choices)
| |
def peek_token(self):
    """Return the next token without removing it from the queue.

    Returns None when there are no more tokens.
    """
    while self.need_more_tokens():
        self.fetch_more_tokens()
    return self.tokens[0] if self.tokens else None
| |
def get_token(self):
    """Remove and return the next token, or return None if there is none."""
    while self.need_more_tokens():
        self.fetch_more_tokens()
    if not self.tokens:
        return None
    # Keep the running count in step with what has been handed out.
    self.tokens_taken += 1
    return self.tokens.pop(0)
| |
| # Private methods. |
| |
def need_more_tokens(self):
    """Tell whether more tokens must be fetched before one can be emitted.

    Returns:
        bool: False once the stream is finished; True when the queue is
        empty; otherwise True only if the token at the head of the queue
        may still turn out to be a simple key.
    """
    if self.done:
        return False
    if not self.tokens:
        return True
    # The current token may be a potential simple key, so we need to look
    # further. Drop stale candidates first so the comparison is accurate.
    self.stale_possible_simple_keys()
    # Always return a real bool instead of falling off the end with None.
    return self.next_possible_simple_key() == self.tokens_taken
| |
def fetch_more_tokens(self):
    """Scan forward to the next token and queue it.

    Raises:
        ScannerError: if the next character cannot start any token.
    """
    # Skip whitespace and comments, drop obsolete simple-key candidates,
    # then compare the column with the current indentation (this may queue
    # tokens and decrease the indentation level).
    self.scan_to_next_token()
    self.stale_possible_simple_keys()
    self.unwind_indent(self.column)

    ch = self.peek()

    # End of stream.
    if ch == "\0":
        return self.fetch_stream_end()

    # TODO: support for BOM within a stream ('\uFEFF' should issue a
    # BOM token).

    # Indicators that only count when an additional check passes. Each
    # character selects its own branch, so the order of the branches is
    # not significant.
    if ch == "%":
        if self.check_directive():
            return self.fetch_directive()
    elif ch == "-":
        # '---' document start takes precedence over a block entry.
        if self.check_document_start():
            return self.fetch_document_start()
        if self.check_block_entry():
            return self.fetch_block_entry()
    elif ch == ".":
        if self.check_document_end():
            return self.fetch_document_end()
    elif ch == "?":
        if self.check_key():
            return self.fetch_key()
    elif ch == ":":
        if self.check_value():
            return self.fetch_value()
    elif ch == "|":
        # Literal scalars exist only in block context.
        if not self.flow_level:
            return self.fetch_literal()
    elif ch == ">":
        # Folded scalars exist only in block context.
        if not self.flow_level:
            return self.fetch_folded()
    else:
        # Indicators that always start a token.
        fetcher_name = {
            "[": "fetch_flow_sequence_start",
            "{": "fetch_flow_mapping_start",
            "]": "fetch_flow_sequence_end",
            "}": "fetch_flow_mapping_end",
            ",": "fetch_flow_entry",
            "*": "fetch_alias",
            "&": "fetch_anchor",
            "!": "fetch_tag",
            "'": "fetch_single",
            '"': "fetch_double",
        }.get(ch)
        if fetcher_name is not None:
            return getattr(self, fetcher_name)()

    # Anything left must be a plain scalar.
    if self.check_plain():
        return self.fetch_plain()

    # Nothing matched: report the offending character.
    raise ScannerError(
        "while scanning for the next token",
        None,
        "found character %r that cannot start any token" % ch,
        self.get_mark(),
    )
| |
| # Simple keys treatment. |
| |
def next_possible_simple_key(self):
    """Return the smallest token number among the possible simple keys.

    Returns None when no simple-key candidates are recorded.
    """
    candidates = self.possible_simple_keys.values()
    return min((key.token_number for key in candidates), default=None)
| |
def stale_possible_simple_keys(self):
    """Discard simple-key candidates that can no longer become keys.

    Per the YAML specification a simple key must fit on a single line and
    be no longer than 1024 characters. Skipping this step would allow
    simple keys of any length and height (which may cause problems if the
    indentation is broken).

    Raises:
        ScannerError: if a discarded candidate was a required key.
    """
    # Iterate over a snapshot because entries are deleted along the way.
    for level, key in list(self.possible_simple_keys.items()):
        if key.line == self.line and self.index - key.index <= 1024:
            continue
        if key.required:
            raise ScannerError(
                "while scanning a simple key",
                key.mark,
                "could not find expected ':'",
                self.get_mark(),
            )
        del self.possible_simple_keys[level]
| |
def save_possible_simple_key(self):
    """Remember the current position as a possible simple key.

    Called before ALIAS, ANCHOR, TAG, SCALAR(flow), '[' and '{' tokens,
    any of which may start a simple key. Nothing is recorded when simple
    keys are not allowed at this position.
    """
    # A key is required here when we are in block context and sit exactly
    # at the current indentation column.
    required = not self.flow_level and self.indent == self.column

    if not self.allow_simple_key:
        return

    # Replace any candidate already stored for this flow level, then save
    # the number of the upcoming token together with its position.
    self.remove_possible_simple_key()
    self.possible_simple_keys[self.flow_level] = SimpleKey(
        self.tokens_taken + len(self.tokens),
        required,
        self.index,
        self.line,
        self.column,
        self.get_mark(),
    )
| |
def remove_possible_simple_key(self):
    """Forget the possible simple key saved for the current flow level.

    Raises:
        ScannerError: if the saved candidate was a required key.
    """
    level = self.flow_level
    if level not in self.possible_simple_keys:
        return

    candidate = self.possible_simple_keys[level]
    if candidate.required:
        raise ScannerError(
            "while scanning a simple key",
            candidate.mark,
            "could not find expected ':'",
            self.get_mark(),
        )

    del self.possible_simple_keys[level]
| |
| # Indentation functions. |
| |
def unwind_indent(self, column):
    """Pop indentation levels deeper than ``column``, queuing BLOCK-END tokens.

    In flow context indentation is ignored, which makes the scanner less
    restrictive than the specification requires. A stricter check (raising
    "invalid indentation or unclosed '[' or '{'" when `self.indent > column`
    in flow context; the spec would even call for `>=`) was left out
    because it prohibits intuitively correct constructions such as
        key : {
        }
    """
    if not self.flow_level:
        # Block context: one BLOCK-END per indentation level we leave.
        while self.indent > column:
            here = self.get_mark()
            self.indent = self.indents.pop()
            self.tokens.append(BlockEndToken(here, here))
| |
def add_indent(self, column):
    """Push a new indentation level if ``column`` is deeper than the current one.

    Returns True when the indentation was increased, False otherwise.
    """
    if self.indent >= column:
        return False
    self.indents.append(self.indent)
    self.indent = column
    return True
| |
| # Fetchers. |
| |
def fetch_stream_start(self):
    """Queue the STREAM-START token.

    STREAM-START is always the first token and STREAM-END the last one.
    """
    here = self.get_mark()
    self.tokens.append(StreamStartToken(here, here, encoding=self.encoding))
| |
def fetch_stream_end(self):
    """Queue the STREAM-END token and mark the scanner as finished."""
    # Drop back to indentation -1.
    self.unwind_indent(-1)

    # No simple key can be pending or start any more.
    self.remove_possible_simple_key()
    self.allow_simple_key = False
    self.possible_simple_keys = {}

    # STREAM-END has zero width, so one mark serves as start and end.
    here = self.get_mark()
    self.tokens.append(StreamEndToken(here, here))

    # The stream is finished.
    self.done = True
| |
def fetch_directive(self):
    """Scan a directive and queue the resulting DIRECTIVE token."""
    # Drop back to indentation -1.
    self.unwind_indent(-1)

    # A directive ends any pending simple key and forbids a new one.
    self.remove_possible_simple_key()
    self.allow_simple_key = False

    directive = self.scan_directive()
    self.tokens.append(directive)
| |
def fetch_document_start(self):
    """Queue a DOCUMENT-START token via `fetch_document_indicator`."""
    self.fetch_document_indicator(DocumentStartToken)
| |
def fetch_document_end(self):
    """Queue a DOCUMENT-END token via `fetch_document_indicator`."""
    self.fetch_document_indicator(DocumentEndToken)
| |
def fetch_document_indicator(self, TokenClass):
    """Consume a three-character document indicator and queue its token.

    ``TokenClass`` is called with the start and end marks to build the
    DOCUMENT-START or DOCUMENT-END token.
    """
    # Drop back to indentation -1.
    self.unwind_indent(-1)

    # Reset simple keys; a block collection cannot follow '---'.
    self.remove_possible_simple_key()
    self.allow_simple_key = False

    begin = self.get_mark()
    self.forward(3)
    self.tokens.append(TokenClass(begin, self.get_mark()))
| |
def fetch_flow_sequence_start(self):
    """Queue a FLOW-SEQUENCE-START token via `fetch_flow_collection_start`."""
    self.fetch_flow_collection_start(FlowSequenceStartToken)
| |
def fetch_flow_mapping_start(self):
    """Queue a FLOW-MAPPING-START token via `fetch_flow_collection_start`."""
    self.fetch_flow_collection_start(FlowMappingStartToken)
| |
def fetch_flow_collection_start(self, TokenClass):
    """Consume a '[' or '{' indicator and queue the token built by ``TokenClass``."""
    # The opening bracket may itself start a simple key; record that at
    # the enclosing flow level before going one level deeper.
    self.save_possible_simple_key()
    self.flow_level += 1

    # A simple key may follow '[' or '{'.
    self.allow_simple_key = True

    opened_at = self.get_mark()
    self.forward()
    self.tokens.append(TokenClass(opened_at, self.get_mark()))
| |
def fetch_flow_sequence_end(self):
    """Queue a FLOW-SEQUENCE-END token via `fetch_flow_collection_end`."""
    self.fetch_flow_collection_end(FlowSequenceEndToken)
| |
def fetch_flow_mapping_end(self):
    """Queue a FLOW-MAPPING-END token via `fetch_flow_collection_end`."""
    self.fetch_flow_collection_end(FlowMappingEndToken)
| |
def fetch_flow_collection_end(self, TokenClass):
    """Consume a ']' or '}' indicator and queue the token built by ``TokenClass``."""
    # Drop the candidate key of the level being closed, then leave it.
    self.remove_possible_simple_key()
    self.flow_level -= 1

    # No simple key may follow ']' or '}'.
    self.allow_simple_key = False

    closed_at = self.get_mark()
    self.forward()
    self.tokens.append(TokenClass(closed_at, self.get_mark()))
| |
def fetch_flow_entry(self):
    """Consume a ',' indicator and queue a FLOW-ENTRY token."""
    # A simple key may follow ','.
    self.allow_simple_key = True

    # Whatever candidate was pending on this level is no longer a key.
    self.remove_possible_simple_key()

    comma_at = self.get_mark()
    self.forward()
    self.tokens.append(FlowEntryToken(comma_at, self.get_mark()))
| |
def fetch_block_entry(self):
    """Consume a '-' indicator and queue a BLOCK-ENTRY token.

    In block context a BLOCK-SEQUENCE-START token is queued first when the
    entry opens a new, deeper indentation level. A block entry in flow
    context is an error, but detecting it is left to the parser.

    Raises:
        ScannerError: in block context, if a new entry may not start here.
    """
    in_block_context = not self.flow_level

    # Are we allowed to start a new entry?
    if in_block_context and not self.allow_simple_key:
        raise ScannerError(
            None, None, "sequence entries are not allowed here", self.get_mark()
        )

    # We may need to open a block sequence.
    if in_block_context and self.add_indent(self.column):
        here = self.get_mark()
        self.tokens.append(BlockSequenceStartToken(here, here))

    # A simple key may follow '-'.
    self.allow_simple_key = True

    # Whatever candidate was pending on this level is no longer a key.
    self.remove_possible_simple_key()

    dash_at = self.get_mark()
    self.forward()
    self.tokens.append(BlockEntryToken(dash_at, self.get_mark()))
| |
def fetch_key(self):
    """Consume a '?' indicator and queue a KEY token.

    In block context a BLOCK-MAPPING-START token is queued first when the
    key opens a new, deeper indentation level.

    Raises:
        ScannerError: in block context, if a key may not start here.
    """
    in_block_context = not self.flow_level

    # Are we allowed to start a key (not necessarily a simple one)?
    if in_block_context and not self.allow_simple_key:
        raise ScannerError(
            None, None, "mapping keys are not allowed here", self.get_mark()
        )

    # We may need to open a block mapping.
    if in_block_context and self.add_indent(self.column):
        here = self.get_mark()
        self.tokens.append(BlockMappingStartToken(here, here))

    # A simple key may follow '?' only in block context.
    self.allow_simple_key = not self.flow_level

    # Whatever candidate was pending on this level is no longer a key.
    self.remove_possible_simple_key()

    question_at = self.get_mark()
    self.forward()
    self.tokens.append(KeyToken(question_at, self.get_mark()))
| |
def fetch_value(self):
    """Consume a ':' indicator and queue a VALUE token.

    If a simple-key candidate is pending on the current flow level, a KEY
    token (and, in block context, possibly a BLOCK-MAPPING-START token) is
    inserted retroactively in front of the token that started the key.

    Raises:
        ScannerError: in block context, if no simple key is pending and a
            value may not start here.
    """
    if self.flow_level in self.possible_simple_keys:
        # The pending candidate really was a simple key: insert KEY in
        # front of the token that started it.
        key = self.possible_simple_keys.pop(self.flow_level)
        position = key.token_number - self.tokens_taken
        self.tokens.insert(position, KeyToken(key.mark, key.mark))

        # If the key opens a new block mapping, BLOCK-MAPPING-START goes
        # in front of the KEY token just inserted.
        if not self.flow_level and self.add_indent(key.column):
            self.tokens.insert(
                position, BlockMappingStartToken(key.mark, key.mark)
            )

        # There cannot be two simple keys one after another.
        self.allow_simple_key = False
    else:
        # No pending candidate: the ':' must belong to a complex key.
        if not self.flow_level:
            # Block context only: a complex value may start if and only if
            # a simple key could start here. (The parser would catch this
            # anyway.)
            if not self.allow_simple_key:
                raise ScannerError(
                    None,
                    None,
                    "mapping values are not allowed here",
                    self.get_mark(),
                )

            # A value that opens a new block mapping needs
            # BLOCK-MAPPING-START; the parser reports it as an error later.
            if self.add_indent(self.column):
                here = self.get_mark()
                self.tokens.append(BlockMappingStartToken(here, here))

        # A simple key may follow ':' only in block context.
        self.allow_simple_key = not self.flow_level

        # Whatever candidate was pending on this level is no longer a key.
        self.remove_possible_simple_key()

    colon_at = self.get_mark()
    self.forward()
    self.tokens.append(ValueToken(colon_at, self.get_mark()))
| |
def fetch_alias(self):
    """Scan an alias and queue the resulting ALIAS token."""
    # An alias may be a simple key, but none may start right after it.
    self.save_possible_simple_key()
    self.allow_simple_key = False

    alias_token = self.scan_anchor(AliasToken)
    self.tokens.append(alias_token)
| |
def fetch_anchor(self):
    """Scan an anchor and queue the resulting ANCHOR token."""
    # An anchor may start a simple key, but none may start right after it.
    self.save_possible_simple_key()
    self.allow_simple_key = False

    anchor_token = self.scan_anchor(AnchorToken)
    self.tokens.append(anchor_token)
| |
def fetch_tag(self):
    """Scan a tag and queue the resulting TAG token."""
    # A tag may start a simple key, but none may start right after it.
    self.save_possible_simple_key()
    self.allow_simple_key = False

    tag_token = self.scan_tag()
    self.tokens.append(tag_token)
| |
def fetch_literal(self):
    """Queue a block scalar token with the literal style indicator '|'."""
    self.fetch_block_scalar(style="|")
| |
def fetch_folded(self):
    """Queue a block scalar token with the folded style indicator '>'."""
    self.fetch_block_scalar(style=">")
| |
def fetch_block_scalar(self, style):
    """Scan a block scalar of the given ``style`` and queue its SCALAR token."""
    # A simple key may follow a block scalar.
    self.allow_simple_key = True

    # Whatever candidate was pending on this level is no longer a key.
    self.remove_possible_simple_key()

    scalar_token = self.scan_block_scalar(style)
    self.tokens.append(scalar_token)
| |
def fetch_single(self):
    """Queue a flow scalar token for a single-quoted scalar."""
    self.fetch_flow_scalar(style="'")
| |
def fetch_double(self):
    """Queue a flow scalar token for a double-quoted scalar."""
    self.fetch_flow_scalar(style='"')
| |
def fetch_flow_scalar(self, style):
    """Scan a quoted scalar of the given ``style`` and queue its SCALAR token."""
    # A flow scalar may be a simple key, but none may start right after it.
    self.save_possible_simple_key()
    self.allow_simple_key = False

    scalar_token = self.scan_flow_scalar(style)
    self.tokens.append(scalar_token)
| |
def fetch_plain(self):
    """Scan a plain scalar and queue its SCALAR token."""
    # A plain scalar may be a simple key.
    self.save_possible_simple_key()

    # No simple key may follow a plain scalar -- but `scan_plain` turns
    # the flag back on if the scan ends at the beginning of a line, so the
    # reset has to happen before the scan.
    self.allow_simple_key = False

    scalar_token = self.scan_plain()
    self.tokens.append(scalar_token)
| |
| # Checkers. |
| |
def check_directive(self):
    """Tell whether the '%' at the current position starts a directive.

    DIRECTIVE: ^ '%' ...
    The caller has already checked the '%' indicator itself, so only the
    column remains to be verified.

    Returns:
        bool: True if the scanner is at column 0, False otherwise (an
        explicit bool rather than an implicit None).
    """
    return self.column == 0
| |
def check_document_start(self):
    """Tell whether a document start indicator begins at the current position.

    DOCUMENT-START: ^ '---' (' '|'\\n')

    Returns:
        bool: True if the scanner is at column 0, the next three characters
        are '---' and they are followed by whitespace, a line break or the
        end of input; False otherwise (an explicit bool rather than an
        implicit None).
    """
    return (
        self.column == 0
        and self.prefix(3) == "---"
        and self.peek(3) in "\0 \t\r\n\x85\u2028\u2029"
    )
| |
def check_document_end(self):
    """Tell whether a document end indicator begins at the current position.

    DOCUMENT-END: ^ '...' (' '|'\\n')

    Returns:
        bool: True if the scanner is at column 0, the next three characters
        are '...' and they are followed by whitespace, a line break or the
        end of input; False otherwise (an explicit bool rather than an
        implicit None).
    """
    return (
        self.column == 0
        and self.prefix(3) == "..."
        and self.peek(3) in "\0 \t\r\n\x85\u2028\u2029"
    )
| |
def check_block_entry(self):
    """Tell whether the '-' at the current position is a block entry indicator.

    BLOCK-ENTRY: '-' (' '|'\\n')
    """
    following = self.peek(1)
    return following in "\0 \t\r\n\x85\u2028\u2029"
| |
def check_key(self):
    """Tell whether the '?' at the current position is a key indicator.

    KEY(flow context):  '?'
    KEY(block context): '?' (' '|'\\n')
    """
    if self.flow_level:
        return True
    following = self.peek(1)
    return following in "\0 \t\r\n\x85\u2028\u2029"
| |
def check_value(self):
    """Tell whether the ':' at the current position is a value indicator.

    VALUE(flow context):  ':'
    VALUE(block context): ':' (' '|'\\n')
    """
    if self.flow_level:
        return True
    following = self.peek(1)
    return following in "\0 \t\r\n\x85\u2028\u2029"
| |
def check_plain(self):
    """Tell whether a plain scalar may start at the current position.

    A plain scalar may start with any non-space character except
        '-', '?', ':', ',', '[', ']', '{', '}',
        '#', '&', '*', '!', '|', '>', '\\'', '\\"',
        '%', '@', '`'.
    It may also start with '-', '?' or ':' when that character is followed
    by a non-space character. Apart from '-', this second rule is limited
    to block context so that flow context stays space independent.
    """
    ch = self.peek()

    # Ordinary characters can always start a plain scalar.
    if ch not in "\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>'\"%@`":
        return True

    # An indicator followed by a space or line break never can.
    if self.peek(1) in "\0 \t\r\n\x85\u2028\u2029":
        return False

    # '-' qualifies everywhere; '?' and ':' only in block context.
    return ch == "-" or (not self.flow_level and ch in "?:")
| |
| # Scanners. |
| |
def scan_to_next_token(self):
    """Skip spaces, comments and line breaks up to the next token.

    A line break found in block context switches `allow_simple_key` on.

    A byte order mark is stripped only when it is the very first character
    of the stream. A BOM inside the stream is not yet supported as the
    specification requires; any such mark is treated as part of the
    document.
    """
    # TODO: We need to make tab handling rules more sane. A good rule is
    #   Tabs cannot precede tokens
    #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
    #   KEY(block), VALUE(block), BLOCK-ENTRY
    # So the checking code is
    #   if <TAB>:
    #       self.allow_simple_keys = False
    # We also need to add the check for `allow_simple_keys == True` to
    # `unwind_indent` before issuing BLOCK-END.
    # Scanners for block, flow, and plain scalars need to be modified.

    if self.index == 0 and self.peek() == "\uFEFF":
        self.forward()

    while True:
        # Leading spaces.
        while self.peek() == " ":
            self.forward()

        # A comment runs to the end of the line.
        if self.peek() == "#":
            while self.peek() not in "\0\r\n\x85\u2028\u2029":
                self.forward()

        # No line break here means the next token has been reached.
        if not self.scan_line_break():
            break

        if not self.flow_level:
            self.allow_simple_key = True
| |
def scan_directive(self):
    """Scan a directive line and return its DirectiveToken.

    YAML and TAG directives get their value scanned; for any other
    directive name the value is None and the rest of the line is skipped.
    See the specification for details.
    """
    start_mark = self.get_mark()
    self.forward()
    name = self.scan_directive_name(start_mark)

    # Only the YAML and TAG directives carry a value we understand.
    value_scanners = {
        "YAML": self.scan_yaml_directive_value,
        "TAG": self.scan_tag_directive_value,
    }
    scan_value = value_scanners.get(name)
    value = scan_value(start_mark) if scan_value is not None else None
    end_mark = self.get_mark()

    if scan_value is None:
        # Unknown directive: ignore everything up to the end of the line.
        while self.peek() not in "\0\r\n\x85\u2028\u2029":
            self.forward()

    self.scan_directive_ignored_line(start_mark)
    return DirectiveToken(name, value, start_mark, end_mark)
| |
def scan_directive_name(self, start_mark):
    """Scan and return the name of a directive.

    The name is a non-empty run of ASCII letters, digits, '-' and '_' that
    must be followed by a space, a line break or the end of input. See the
    specification for details.

    Raises:
        ScannerError: if the name is empty or followed by anything else.
    """
    length = 0
    while True:
        ch = self.peek(length)
        if not (
            "0" <= ch <= "9" or "A" <= ch <= "Z" or "a" <= ch <= "z" or ch in "-_"
        ):
            break
        length += 1

    if length == 0:
        raise ScannerError(
            "while scanning a directive",
            start_mark,
            "expected alphabetic or numeric character, but found %r" % ch,
            self.get_mark(),
        )

    name = self.prefix(length)
    self.forward(length)

    following = self.peek()
    if following not in "\0 \r\n\x85\u2028\u2029":
        raise ScannerError(
            "while scanning a directive",
            start_mark,
            "expected alphabetic or numeric character, but found %r" % following,
            self.get_mark(),
        )
    return name
| |
def scan_yaml_directive_value(self, start_mark):
    """Scan the ``major.minor`` version of a YAML directive.

    Returns a ``(major, minor)`` tuple of the two scanned numbers. See the
    specification for details.

    Raises:
        ScannerError: if the '.' separator is missing or the version is
            not followed by a space, a line break or the end of input.
    """
    while self.peek() == " ":
        self.forward()

    major = self.scan_yaml_directive_number(start_mark)

    separator = self.peek()
    if separator != ".":
        raise ScannerError(
            "while scanning a directive",
            start_mark,
            "expected a digit or '.', but found %r" % separator,
            self.get_mark(),
        )
    self.forward()

    minor = self.scan_yaml_directive_number(start_mark)

    following = self.peek()
    if following not in "\0 \r\n\x85\u2028\u2029":
        raise ScannerError(
            "while scanning a directive",
            start_mark,
            "expected a digit or ' ', but found %r" % following,
            self.get_mark(),
        )
    return (major, minor)
| |
def scan_yaml_directive_number(self, start_mark):
    """Scan a run of ASCII digits and return it as an int.

    See the specification for details.

    Raises:
        ScannerError: if the current character is not a digit.
    """
    first = self.peek()
    if first < "0" or first > "9":
        raise ScannerError(
            "while scanning a directive",
            start_mark,
            "expected a digit, but found %r" % first,
            self.get_mark(),
        )

    # Measure the run of digits before consuming it.
    length = 0
    while "0" <= self.peek(length) <= "9":
        length += 1

    number = int(self.prefix(length))
    self.forward(length)
    return number
| |
def scan_tag_directive_value(self, start_mark):
    """Scan the handle and prefix of a TAG directive.

    Returns a ``(handle, prefix)`` tuple. See the specification for details.
    """
    # Spaces separate the directive name, the handle and the prefix.
    while self.peek() == " ":
        self.forward()
    handle = self.scan_tag_directive_handle(start_mark)

    while self.peek() == " ":
        self.forward()
    return (handle, self.scan_tag_directive_prefix(start_mark))
| |
def scan_tag_directive_handle(self, start_mark):
    """Scan the handle of a TAG directive; it must be followed by a space.

    See the specification for details.

    Raises:
        ScannerError: if the handle is not followed by a space.
    """
    handle = self.scan_tag_handle("directive", start_mark)
    following = self.peek()
    if following == " ":
        return handle
    raise ScannerError(
        "while scanning a directive",
        start_mark,
        "expected ' ', but found %r" % following,
        self.get_mark(),
    )
| |
def scan_tag_directive_prefix(self, start_mark):
    """Scan the prefix of a TAG directive.

    The prefix must be followed by a space, a line break or the end of
    input. See the specification for details.

    Raises:
        ScannerError: if the prefix is followed by anything else.
    """
    prefix = self.scan_tag_uri("directive", start_mark)
    following = self.peek()
    if following in "\0 \r\n\x85\u2028\u2029":
        return prefix
    raise ScannerError(
        "while scanning a directive",
        start_mark,
        "expected ' ', but found %r" % following,
        self.get_mark(),
    )
| |
def scan_directive_ignored_line(self, start_mark):
    """Consume the ignorable remainder of a directive line.

    Trailing spaces and an optional comment are skipped, then the line
    break is consumed. See the specification for details.

    Raises:
        ScannerError: if anything but a comment or a line break follows.
    """
    end_of_line = "\0\r\n\x85\u2028\u2029"

    while self.peek() == " ":
        self.forward()

    if self.peek() == "#":
        while self.peek() not in end_of_line:
            self.forward()

    leftover = self.peek()
    if leftover not in end_of_line:
        raise ScannerError(
            "while scanning a directive",
            start_mark,
            "expected a comment or a line break, but found %r" % leftover,
            self.get_mark(),
        )
    self.scan_line_break()
| |
def scan_anchor(self, TokenClass):
    """Scan an alias ('*name') or anchor ('&name') and return its token.

    ``TokenClass`` is called with the name and the start and end marks.

    The specification does not restrict the characters of anchors and
    aliases, which can be ambiguous: the document
        [ *alias, value ]
    can be read as
        [ "value" ]
    or as
        [ *alias , "value" ]
    The name is therefore restricted here to ASCII letters, digits, '-'
    and '_'.

    Raises:
        ScannerError: if the name is empty or followed by an unexpected
            character.
    """
    start_mark = self.get_mark()

    # The leading indicator only decides the wording of error messages.
    name = "alias" if self.peek() == "*" else "anchor"
    self.forward()

    length = 0
    while True:
        ch = self.peek(length)
        if not (
            "0" <= ch <= "9" or "A" <= ch <= "Z" or "a" <= ch <= "z" or ch in "-_"
        ):
            break
        length += 1

    if length == 0:
        raise ScannerError(
            "while scanning an %s" % name,
            start_mark,
            "expected alphabetic or numeric character, but found %r" % ch,
            self.get_mark(),
        )

    value = self.prefix(length)
    self.forward(length)

    following = self.peek()
    if following not in "\0 \t\r\n\x85\u2028\u2029?:,]}%@`":
        raise ScannerError(
            "while scanning an %s" % name,
            start_mark,
            "expected alphabetic or numeric character, but found %r" % following,
            self.get_mark(),
        )
    return TokenClass(value, start_mark, self.get_mark())
| |
def scan_tag(self):
    """Scan a tag and return a TagToken whose value is ``(handle, suffix)``.

    Three forms are recognized: a verbatim tag '!<...>' (handle is None),
    a lone '!' (handle is None, suffix is '!'), and a handle followed by a
    suffix. See the specification for details.

    Raises:
        ScannerError: if a verbatim tag is not closed by '>' or the tag is
            not followed by a space, a line break or the end of input.
    """
    start_mark = self.get_mark()
    ch = self.peek(1)

    if ch == "<":
        # Verbatim tag: '!<' uri '>'.
        handle = None
        self.forward(2)
        suffix = self.scan_tag_uri("tag", start_mark)
        if self.peek() != ">":
            raise ScannerError(
                "while parsing a tag",
                start_mark,
                "expected '>', but found %r" % self.peek(),
                self.get_mark(),
            )
        self.forward()
    elif ch in "\0 \t\r\n\x85\u2028\u2029":
        # A lone '!'.
        handle = None
        suffix = "!"
        self.forward()
    else:
        # Look ahead for a second '!' before the tag ends; if there is
        # one, the tag starts with a named handle.
        offset = 1
        while ch not in "\0 \r\n\x85\u2028\u2029" and ch != "!":
            offset += 1
            ch = self.peek(offset)

        if ch == "!":
            handle = self.scan_tag_handle("tag", start_mark)
        else:
            handle = "!"
            self.forward()
        suffix = self.scan_tag_uri("tag", start_mark)

    following = self.peek()
    if following not in "\0 \r\n\x85\u2028\u2029":
        raise ScannerError(
            "while scanning a tag",
            start_mark,
            "expected ' ', but found %r" % following,
            self.get_mark(),
        )

    value = (handle, suffix)
    return TagToken(value, start_mark, self.get_mark())
| |
def scan_block_scalar(self, style):
    """Scan a block scalar and return its ScalarToken.

    ``style`` is the block scalar indicator; '>' selects folding, any
    other value is scanned without folding. The token is built with the
    joined text, ``False`` as its second argument, the start and end marks
    and ``style``. See the specification for details.
    """
    # See the specification for details.

    if style == ">":
        folded = True
    else:
        folded = False

    chunks = []
    start_mark = self.get_mark()

    # Scan the header.
    self.forward()
    chomping, increment = self.scan_block_scalar_indicators(start_mark)
    self.scan_block_scalar_ignored_line(start_mark)

    # Determine the indentation level and go to the first non-empty line.
    # The content must be indented at least one column deeper than the
    # current block indentation, and never less than column 1.
    min_indent = self.indent + 1
    if min_indent < 1:
        min_indent = 1
    if increment is None:
        # No explicit indentation indicator: detect it from the content.
        breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
        indent = max(min_indent, max_indent)
    else:
        # An explicit indicator is relative to the minimum indentation.
        indent = min_indent + increment - 1
        breaks, end_mark = self.scan_block_scalar_breaks(indent)
    line_break = ""

    # Scan the inner part of the block scalar.
    while self.column == indent and self.peek() != "\0":
        chunks.extend(breaks)
        # Remember whether this line starts with a non-space character;
        # the folding decision below depends on it.
        leading_non_space = self.peek() not in " \t"
        length = 0
        while self.peek(length) not in "\0\r\n\x85\u2028\u2029":
            length += 1
        chunks.append(self.prefix(length))
        self.forward(length)
        line_break = self.scan_line_break()
        breaks, end_mark = self.scan_block_scalar_breaks(indent)
        if self.column == indent and self.peek() != "\0":

            # Unfortunately, folding rules are ambiguous.
            #
            # This is the folding according to the specification:

            if (
                folded
                and line_break == "\n"
                and leading_non_space
                and self.peek() not in " \t"
            ):
                # Fold: a single line break between two text lines becomes
                # a space; if empty lines follow, nothing is added here.
                if not breaks:
                    chunks.append(" ")
            else:
                chunks.append(line_break)

            # This is Clark Evans's interpretation (also in the spec
            # examples):
            #
            # if folded and line_break == '\n':
            #     if not breaks:
            #         if self.peek() not in ' \t':
            #             chunks.append(' ')
            #         else:
            #             chunks.append(line_break)
            # else:
            #     chunks.append(line_break)
        else:
            break

    # Chomp the tail. `chomping` is None when no indicator was given
    # (keep the final line break), True for '+' (also keep the trailing
    # breaks) and False for '-' (keep neither).
    if chomping is not False:
        chunks.append(line_break)
    if chomping is True:
        chunks.extend(breaks)

    # We are done.
    return ScalarToken("".join(chunks), False, start_mark, end_mark, style)
| |
def scan_block_scalar_indicators(self, start_mark):
    """Read the optional indicators that follow a block scalar indicator.

    The chomping indicator ("+" or "-") and the indentation indicator
    (a digit 1-9) may appear in either order, each at most once.

    Returns a `(chomping, increment)` pair: `chomping` is True for "+",
    False for "-" and None when absent; `increment` is the digit as an int
    or None when absent.

    Raises ScannerError if the indentation digit is 0 or if the indicators
    are followed by anything other than a space, a line break or the end
    of the stream.
    """

    def take_chomping():
        # Consume "+" or "-" if present; otherwise leave the reader alone.
        sign = self.peek()
        if sign not in "+-":
            return None
        self.forward()
        return sign == "+"

    def take_increment():
        # Consume a single digit if present; 0 is rejected before consuming.
        digit = self.peek()
        if digit not in "0123456789":
            return None
        value = int(digit)
        if value == 0:
            raise ScannerError(
                "while scanning a block scalar",
                start_mark,
                "expected indentation indicator in the range 1-9, but found 0",
                self.get_mark(),
            )
        self.forward()
        return value

    chomping = take_chomping()
    if chomping is not None:
        # Chomping came first, so an indentation digit may follow.
        increment = take_increment()
    else:
        # Otherwise a digit may come first, optionally followed by chomping.
        increment = take_increment()
        if increment is not None:
            chomping = take_chomping()

    ch = self.peek()
    if ch not in "\0 \r\n\x85\u2028\u2029":
        raise ScannerError(
            "while scanning a block scalar",
            start_mark,
            "expected chomping or indentation indicators, but found %r" % ch,
            self.get_mark(),
        )
    return chomping, increment
| |
def scan_block_scalar_ignored_line(self, start_mark):
    """Skip the remainder of a block scalar header line.

    Consumes spaces, an optional "#" comment, and the terminating line
    break (nothing is consumed for the break at the end of the stream).

    Raises ScannerError if anything other than a comment or a line break
    follows the spaces.
    """
    line_end = "\0\r\n\x85\u2028\u2029"

    while self.peek() == " ":
        self.forward()
    if self.peek() == "#":
        # A comment runs up to, but not including, the line break.
        while self.peek() not in line_end:
            self.forward()

    found = self.peek()
    if found in line_end:
        self.scan_line_break()
        return
    raise ScannerError(
        "while scanning a block scalar",
        start_mark,
        "expected a comment or a line break, but found %r" % found,
        self.get_mark(),
    )
| |
def scan_block_scalar_indentation(self):
    """Skip leading spaces and line breaks, measuring the deepest column.

    Used when a block scalar has no explicit indentation indicator.

    Returns `(breaks, max_indent, end_mark)`: the line breaks that were
    consumed, the largest column reached while stepping over spaces, and
    the mark taken right after the last consumed line break (or at entry
    if there was none).
    """
    breaks = []
    deepest = 0
    end_mark = self.get_mark()
    while True:
        ch = self.peek()
        if ch == " ":
            self.forward()
            deepest = max(deepest, self.column)
        elif ch in "\r\n\x85\u2028\u2029":
            breaks.append(self.scan_line_break())
            end_mark = self.get_mark()
        else:
            return breaks, deepest, end_mark
| |
def scan_block_scalar_breaks(self, indent):
    """Consume empty lines inside a block scalar.

    Spaces are skipped only up to column `indent`, so deeper indentation
    is left in place for the caller to read as content.

    Returns `(breaks, end_mark)`: the consumed line breaks and the mark
    taken right after the last one (or at entry if there was none).
    """

    def skip_indentation():
        while self.column < indent and self.peek() == " ":
            self.forward()

    breaks = []
    end_mark = self.get_mark()
    skip_indentation()
    while self.peek() in "\r\n\x85\u2028\u2029":
        breaks.append(self.scan_line_break())
        end_mark = self.get_mark()
        skip_indentation()
    return breaks, end_mark
| |
def scan_flow_scalar(self, style):
    """Scan a quoted scalar and return it as a ScalarToken.

    `style` is the quote style; '"' enables double-quoted handling
    (backslash escapes), anything else is treated as single-quoted.
    The reader is expected to be positioned on the opening quote.

    Indentation rules are deliberately relaxed for quoted scalars: the
    quotes mark the beginning and the end unambiguously, so this scanner
    is less restrictive than the specification. The only structural check
    is that document separators do not appear inside the scalar.
    """
    double = style == '"'
    start_mark = self.get_mark()

    # Remember the opening quote character, then step past it.
    quote = self.peek()
    self.forward()

    # Alternate between runs of non-blank content and runs of blanks until
    # the matching closing quote is reached.
    pieces = list(self.scan_flow_scalar_non_spaces(double, start_mark))
    while self.peek() != quote:
        pieces += self.scan_flow_scalar_spaces(double, start_mark)
        pieces += self.scan_flow_scalar_non_spaces(double, start_mark)

    # Step past the closing quote.
    self.forward()
    end_mark = self.get_mark()
    return ScalarToken("".join(pieces), False, start_mark, end_mark, style)
| |
# Single-character escapes of double-quoted scalars: the character that
# follows the backslash mapped to the character it stands for.
ESCAPE_REPLACEMENTS = {
    "0": "\x00",
    "a": "\a",
    "b": "\b",
    "t": "\t",
    "\t": "\t",
    "n": "\n",
    "v": "\v",
    "f": "\f",
    "r": "\r",
    "e": "\x1b",
    " ": " ",
    '"': '"',
    "\\": "\\",
    "/": "/",
    "N": "\x85",  # next line
    "_": "\xa0",  # non-breaking space
    "L": "\u2028",  # line separator
    "P": "\u2029",  # paragraph separator
}

# Numeric escapes of double-quoted scalars: the escape letter mapped to
# the number of hexadecimal digits that must follow it.
ESCAPE_CODES = {"x": 2, "u": 4, "U": 8}
| |
def scan_flow_scalar_non_spaces(self, double, start_mark):
    """Scan a run of non-blank content inside a quoted scalar.

    `double` is True for a double-quoted scalar and False for a
    single-quoted one. Scanning stops, without consuming it, at a blank,
    a line break, the end of the stream, or the closing quote.

    Returns the list of string chunks that were read.

    Raises ScannerError for an unknown escape character, for a numeric
    escape that is not followed by the required number of hexadecimal
    digits, or for a numeric escape whose value is not a valid Unicode
    code point (greater than 0x10FFFF).
    """
    chunks = []
    while True:
        # Plain run: everything up to the next quote, backslash, blank,
        # line break or end of stream.
        length = 0
        while self.peek(length) not in "'\"\\\0 \t\r\n\x85\u2028\u2029":
            length += 1
        if length:
            chunks.append(self.prefix(length))
            self.forward(length)
        ch = self.peek()
        if not double and ch == "'" and self.peek(1) == "'":
            # A doubled single quote is an escaped quote.
            chunks.append("'")
            self.forward(2)
        elif (double and ch == "'") or (not double and ch in '"\\'):
            # The other style's quote (and, in single-quoted scalars, the
            # backslash) is ordinary content.
            chunks.append(ch)
            self.forward()
        elif double and ch == "\\":
            self.forward()
            ch = self.peek()
            if ch in self.ESCAPE_REPLACEMENTS:
                chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                self.forward()
            elif ch in self.ESCAPE_CODES:
                length = self.ESCAPE_CODES[ch]
                self.forward()
                for k in range(length):
                    if self.peek(k) not in "0123456789ABCDEFabcdef":
                        raise ScannerError(
                            "while scanning a double-quoted scalar",
                            start_mark,
                            "expected escape sequence of %d hexadecimal numbers, but found %r"
                            % (length, self.peek(k)),
                            self.get_mark(),
                        )
                code = int(self.prefix(length), 16)
                # An 8-digit escape can exceed the Unicode range, in which
                # case chr() would raise a bare ValueError; report it as a
                # scanner error with position information instead.
                if code > 0x10FFFF:
                    raise ScannerError(
                        "while scanning a double-quoted scalar",
                        start_mark,
                        "expected escape sequence for a Unicode code point up to 0x10FFFF, but found 0x%X"
                        % code,
                        self.get_mark(),
                    )
                chunks.append(chr(code))
                self.forward(length)
            elif ch in "\r\n\x85\u2028\u2029":
                # An escaped line break: drop it, keep any following breaks.
                self.scan_line_break()
                chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
            else:
                raise ScannerError(
                    "while scanning a double-quoted scalar",
                    start_mark,
                    "found unknown escape character %r" % ch,
                    self.get_mark(),
                )
        else:
            return chunks
| |
def scan_flow_scalar_spaces(self, double, start_mark):
    """Scan a run of blanks, and any line folding, inside a quoted scalar.

    Returns the list of string chunks the blanks contribute:

    * blanks followed by content on the same line are kept verbatim;
    * blanks followed by a line break are dropped, and the break is folded:
      a lone "\\n" becomes a single space, a "\\n" followed by further
      breaks contributes only those further breaks, and any other kind of
      line break is kept together with the following ones.

    Raises ScannerError if the end of the stream is reached.
    """
    length = 0
    while self.peek(length) in " \t":
        length += 1
    blanks = self.prefix(length)
    self.forward(length)

    ch = self.peek()
    if ch == "\0":
        raise ScannerError(
            "while scanning a quoted scalar",
            start_mark,
            "found unexpected end of stream",
            self.get_mark(),
        )
    if ch not in "\r\n\x85\u2028\u2029":
        return [blanks]

    line_break = self.scan_line_break()
    breaks = self.scan_flow_scalar_breaks(double, start_mark)
    folded = []
    if line_break != "\n":
        folded.append(line_break)
    elif not breaks:
        folded.append(" ")
    folded.extend(breaks)
    return folded
| |
def scan_flow_scalar_breaks(self, double, start_mark):
    """Consume blank lines inside a quoted scalar.

    Skips spaces and tabs at the start of each line and collects every
    line break that follows them. `double` is accepted for symmetry with
    the other flow scalar helpers and is not used here.

    Returns the list of consumed line breaks.

    Raises ScannerError if a line starts with a document separator
    ("---" or "..." followed by a blank, a line break or the end of the
    stream); this check stands in for an indentation check.
    """
    collected = []
    while True:
        head = self.prefix(3)
        if head in ("---", "...") and self.peek(3) in "\0 \t\r\n\x85\u2028\u2029":
            raise ScannerError(
                "while scanning a quoted scalar",
                start_mark,
                "found unexpected document separator",
                self.get_mark(),
            )
        while self.peek() in " \t":
            self.forward()
        if self.peek() not in "\r\n\x85\u2028\u2029":
            return collected
        collected.append(self.scan_line_break())
| |
def scan_plain(self):
    """Scan a plain (unquoted) scalar and return it as a ScalarToken.

    A chunk of the scalar ends at a blank, a line break, the end of the
    stream, or a ":" that is followed by one of those. In the flow context
    a chunk additionally ends at any of ",?[]{}", and a ":" also ends it
    when followed by one of ",[]{}". A "#" at the start of a chunk ends the
    scalar.

    `allow_simple_key` is cleared as soon as a chunk is consumed.
    Indentation is only enforced in the block context; zero indentation is
    allowed, which is why document separators are checked for while
    scanning the spaces between chunks.
    """
    blank = "\0 \t\r\n\x85\u2028\u2029"
    pieces = []
    start_mark = self.get_mark()
    end_mark = start_mark
    indent = self.indent + 1
    # Separation (spaces / folded breaks) found after the previous chunk;
    # it is only committed once another chunk actually follows.
    spaces = []

    while self.peek() != "#":
        # Measure the next chunk without consuming it.
        length = 0
        while True:
            ch = self.peek(length)
            if ch in blank:
                break
            if ch == ":" and self.peek(length + 1) in blank + (
                ",[]{}" if self.flow_level else ""
            ):
                break
            if self.flow_level and ch in ",?[]{}":
                break
            length += 1
        if length == 0:
            break

        self.allow_simple_key = False
        pieces.extend(spaces)
        pieces.append(self.prefix(length))
        self.forward(length)
        end_mark = self.get_mark()

        spaces = self.scan_plain_spaces(indent, start_mark)
        if (
            not spaces
            or self.peek() == "#"
            or (not self.flow_level and self.column < indent)
        ):
            break

    return ScalarToken("".join(pieces), True, start_mark, end_mark)
| |
def scan_plain_spaces(self, indent, start_mark):
    """Scan the separation that follows a chunk of a plain scalar.

    Only spaces count as separation on a line; tabs are not accepted here.
    `indent` and `start_mark` are accepted for interface compatibility and
    are not used.

    Returns a list of string chunks, always a list:

    * spaces followed by content on the same line are returned verbatim;
    * spaces followed by a line break are dropped and the break is folded:
      a lone "\\n" becomes a single space, a "\\n" followed by further
      breaks contributes only those further breaks, and any other kind of
      line break is kept together with the following ones;
    * an empty list is returned when there is no separation, or when a
      document separator ("---" or "...") starts one of the following
      lines, which ends the scalar.

    Sets `allow_simple_key` to True once a line break has been consumed.
    """

    def at_document_separator():
        head = self.prefix(3)
        return head in ("---", "...") and self.peek(3) in "\0 \t\r\n\x85\u2028\u2029"

    chunks = []
    length = 0
    while self.peek(length) == " ":
        length += 1
    whitespaces = self.prefix(length)
    self.forward(length)

    ch = self.peek()
    if ch in "\r\n\x85\u2028\u2029":
        line_break = self.scan_line_break()
        self.allow_simple_key = True
        if at_document_separator():
            # Previously a bare `return` (None); an empty list keeps the
            # return type uniform and is still falsy for the caller.
            return []
        breaks = []
        while self.peek() in " \r\n\x85\u2028\u2029":
            if self.peek() == " ":
                self.forward()
            else:
                breaks.append(self.scan_line_break())
                if at_document_separator():
                    return []
        if line_break != "\n":
            chunks.append(line_break)
        elif not breaks:
            chunks.append(" ")
        chunks.extend(breaks)
    elif whitespaces:
        chunks.append(whitespaces)
    return chunks
| |
def scan_tag_handle(self, name, start_mark):
    """Scan a tag handle: "!", "!!" or "!word!".

    `name` describes what is being scanned and is only used in error
    messages. The word part may contain ASCII letters, digits, "-" and
    "_" (the underscore is accepted on purpose even though the
    specification does not allow it in tag handles).

    Returns the handle text, including its "!" delimiters. A "!" that is
    directly followed by a space is returned as the bare handle "!".

    Raises ScannerError if the handle does not start with "!" or if a
    non-empty word is not terminated by "!".
    """

    def is_word_char(c):
        return "0" <= c <= "9" or "A" <= c <= "Z" or "a" <= c <= "z" or c in "-_"

    first = self.peek()
    if first != "!":
        raise ScannerError(
            "while scanning a %s" % name,
            start_mark,
            "expected '!', but found %r" % first,
            self.get_mark(),
        )

    length = 1
    ch = self.peek(length)
    if ch != " ":
        while is_word_char(ch):
            length += 1
            ch = self.peek(length)
        if ch != "!":
            # Move to the offending character so the error mark points at it.
            self.forward(length)
            raise ScannerError(
                "while scanning a %s" % name,
                start_mark,
                "expected '!', but found %r" % ch,
                self.get_mark(),
            )
        length += 1

    handle = self.prefix(length)
    self.forward(length)
    return handle
| |
def scan_tag_uri(self, name, start_mark):
    """Scan the URI part of a tag or tag directive.

    `name` describes what is being scanned and is only used in error
    messages. Accepted characters are ASCII letters, digits and any of
    "-;/?:@&=+$,_.!~*'()[]%"; "%" introduces escapes that are decoded by
    `scan_uri_escapes`. The URI is not checked for being well-formed.

    Returns the URI text with escapes decoded.

    Raises ScannerError if no URI characters are found.
    """

    def is_uri_char(c):
        return (
            "0" <= c <= "9"
            or "A" <= c <= "Z"
            or "a" <= c <= "z"
            or c in "-;/?:@&=+$,_.!~*'()[]%"
        )

    pieces = []
    length = 0
    ch = self.peek(length)
    while is_uri_char(ch):
        if ch != "%":
            length += 1
        else:
            # Flush the literal run collected so far, then decode the
            # escape sequence that starts here.
            pieces.append(self.prefix(length))
            self.forward(length)
            length = 0
            pieces.append(self.scan_uri_escapes(name, start_mark))
        ch = self.peek(length)

    if length:
        pieces.append(self.prefix(length))
        self.forward(length)
    if not pieces:
        raise ScannerError(
            "while parsing a %s" % name,
            start_mark,
            "expected URI, but found %r" % ch,
            self.get_mark(),
        )
    return "".join(pieces)
| |
def scan_uri_escapes(self, name, start_mark):
    """Decode a run of consecutive "%XX" escapes in a tag URI.

    `name` describes what is being scanned and is only used in error
    messages. All adjacent escapes are collected as bytes first and then
    decoded together as UTF-8, so a multi-byte character may span several
    escapes.

    Returns the decoded text.

    Raises ScannerError if a "%" is not followed by two hexadecimal digits
    or if the collected bytes are not valid UTF-8.
    """
    hex_digits = "0123456789ABCDEFabcdef"
    # Position of the first "%", reported if the bytes fail to decode.
    mark = self.get_mark()
    octets = []
    while self.peek() == "%":
        self.forward()
        for offset in (0, 1):
            found = self.peek(offset)
            if found not in hex_digits:
                raise ScannerError(
                    "while scanning a %s" % name,
                    start_mark,
                    "expected URI escape sequence of 2 hexadecimal numbers, but found %r"
                    % found,
                    self.get_mark(),
                )
        octets.append(int(self.prefix(2), 16))
        self.forward(2)
    try:
        return bytes(octets).decode("utf-8")
    except UnicodeDecodeError as exc:
        raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark)
| |
def scan_line_break(self):
    """Consume one line break, if present, and return its normalized form.

    Nothing is consumed and "" is returned when the reader is not
    positioned on a line break.
    """
    # Normalization table:
    #   '\r\n'   ->  '\n'
    #   '\r'     ->  '\n'
    #   '\n'     ->  '\n'
    #   '\x85'   ->  '\n'
    #   '\u2028' ->  '\u2028'
    #   '\u2029' ->  '\u2029'
    #   other    ->  ''
    current = self.peek()
    if current in "\r\n\x85":
        # A CR LF pair counts as a single break.
        if self.prefix(2) == "\r\n":
            self.forward(2)
        else:
            self.forward()
        return "\n"
    if current in "\u2028\u2029":
        self.forward()
        return current
    return ""