Source code for superdiff.parser

import re
from typing import Sequence


# Regex alternation matching a single newline sequence: CRLF, a lone CR, or a
# lone LF.  Alternatives are tried left to right, so the two-character CRLF
# form must come before the lone CR for a CRLF pair to be consumed as one
# newline.  The pattern is not wrapped in a group of its own; users of this
# constant wrap it in parentheses before combining it with other patterns.
_NEWLINE_CHARS = r'(\r\n)|(\r)|(\n)'  # KEEP THESE IN ORDER
# Zero or more newline sequences, spaces or tabs, in any order.
_NEWLINE_THEN_WHITESPACE = r'({newline}|[ \t])*'.format(newline=_NEWLINE_CHARS)


class Parser:
    '''
    Splits a text into Lines made of word, whitespace and newline tokens.

    The ``ignore_*`` flags select the regular expressions used for
    tokenizing and are also stored in a ``Parser.Settings`` instance that is
    handed to every Token and Line created by ``parse``:

    * ``ignore_case``: word tokens are lower-cased for comparison.
    * ``ignore_non_newline_whitespace``: space/tab tokens compare as the
      empty string.
    * ``ignore_non_newline_whitespace_changes``: a run of spaces/tabs is
      matched as one token and compares as a single space.
    * ``ignore_newline_changes``: a run of consecutive newlines is matched
      as one token and compares as a single LF.
    * ``ignore_blank_lines``: newlines separated only by blank lines are
      matched as one token.  This pattern replaces the one chosen by
      ``ignore_newline_changes`` when both flags are set.
    * ``ignore_leading_whitespace`` / ``ignore_trailing_whitespace``:
      leading / trailing whitespace is stripped from each line's
      transformed text.
    '''

    def __init__(self, ignore_case=False, ignore_non_newline_whitespace=False,
                 ignore_non_newline_whitespace_changes=False,
                 ignore_newline_changes=False, ignore_blank_lines=False,
                 ignore_leading_whitespace=False,
                 ignore_trailing_whitespace=False):
        # A single newline sequence by default.
        self._newline_regex = '({})'.format(_NEWLINE_CHARS)
        if ignore_newline_changes:
            # One or more consecutive newline sequences form one token.
            self._newline_regex += '+'
        if ignore_blank_lines:
            # When ignoring blank lines, we match any of the
            # following (in order):
            #   1. A 'newline sandwich' (any amount or kind of
            #      whitespace surrounded by 2 newlines) followed
            #      optionally by any amount of non-newline whitespace
            #      and the end of the string.
            #   2. A single newline char
            # NOTE: this assignment deliberately overwrites whatever was
            # chosen above for ignore_newline_changes.
            self._newline_regex = (
                r'({newline})({newline_then_whitespace})({newline})(([ \t]*$)?)|'
                '({newline})').format(
                    newline=_NEWLINE_CHARS,
                    newline_then_whitespace=_NEWLINE_THEN_WHITESPACE)

        # Exactly one space or tab per token by default.
        self._whitespace_regex = '[ \t]'
        if ignore_non_newline_whitespace_changes:
            # One or more whitespace chars
            self._whitespace_regex += '+'

        # One or more non-whitespace chars.
        self._word_regex = r'\S+'

        self._settings = self.Settings(
            ignore_case=ignore_case,
            ignore_non_newline_whitespace=ignore_non_newline_whitespace,
            ignore_non_newline_whitespace_changes=ignore_non_newline_whitespace_changes,
            ignore_newline_changes=ignore_newline_changes,
            ignore_blank_lines=ignore_blank_lines,
            ignore_leading_whitespace=ignore_leading_whitespace,
            ignore_trailing_whitespace=ignore_trailing_whitespace,
        )

    class Settings:
        '''
        Plain container for the parser's ``ignore_*`` flags.

        Each constructor argument is stored unchanged on an attribute of the
        same name.
        '''
        # NOTE: we're only supporting \n \r and \r\n as newlines

        def __init__(self, ignore_case=False,
                     ignore_non_newline_whitespace=False,
                     ignore_non_newline_whitespace_changes=False,
                     ignore_newline_changes=False, ignore_blank_lines=False,
                     ignore_leading_whitespace=False,
                     ignore_trailing_whitespace=False):
            self.ignore_case = ignore_case
            self.ignore_non_newline_whitespace = ignore_non_newline_whitespace
            self.ignore_non_newline_whitespace_changes = ignore_non_newline_whitespace_changes
            self.ignore_newline_changes = ignore_newline_changes
            self.ignore_blank_lines = ignore_blank_lines
            self.ignore_leading_whitespace = ignore_leading_whitespace
            self.ignore_trailing_whitespace = ignore_trailing_whitespace

    def _get_token_spec(self) -> Sequence[tuple]:
        '''
        Return ``(token_type, regex)`` pairs in matching priority order.

        The pairs are joined into one alternation by ``parse``, so earlier
        entries win over later ones at the same position.
        '''
        return [
            # IMPORTANT: DO NOT CHANGE THE ORDER OF THESE!!!!
            ('newline', self._newline_regex),
            ('whitespace', self._whitespace_regex),
            ('word', self._word_regex)
        ]

    def parse(self, text: str) -> Sequence['Line']:
        '''
        Tokenize ``text`` and group the tokens into Lines.

        A Line is closed after every newline token; tokens left over after
        the last newline form a final Line without a newline token.  An
        empty ``text`` yields an empty list.

        Characters that match none of the token patterns (whitespace other
        than space, tab, CR and LF, such as a form feed) are skipped by the
        scan and do not appear in any token.
        '''
        token_regex = '|'.join(
            '(?P<{0}>{1})'.format(token_type, regex)
            for token_type, regex in self._get_token_spec())

        lines = []
        tokens = []
        for match in re.finditer(token_regex, text):
            # lastgroup is the name of the outermost group that matched,
            # i.e. the token type from the token spec.
            token = token_factory(match.lastgroup, match, self._settings)
            tokens.append(token)
            if isinstance(token, NewlineToken):
                lines.append(Line(tokens, self._settings))
                tokens = []

        # Flush the last, newline-less line.  Doing this after the loop
        # (rather than testing match.end() == len(text) inside it) keeps the
        # line even when the text ends with a character that no token
        # pattern matches.
        if tokens:
            lines.append(Line(tokens, self._settings))
        return lines
class Line:
    '''
    A line consists of a series of Tokens.  The final token is normally a
    NewlineToken; the last line of a text may lack one.

    Lines are compared and hashed by their transformed text, i.e. the text
    after the ignore settings have been applied, not by their original text.
    '''

    def __init__(self, tokens: Sequence['Token'], settings: 'Parser.Settings') -> None:
        self._tokens = tokens
        self._settings = settings
        # Hash of the transformed text, computed on first use.
        self._hash = None

    @property
    def transformed_text(self) -> str:
        '''The tokens' transformed text, joined and stripped per settings.'''
        text = ''.join(token.transformed_text for token in self._tokens)
        ignore_leading = self._settings.ignore_leading_whitespace
        ignore_trailing = self._settings.ignore_trailing_whitespace
        if ignore_leading and ignore_trailing:
            return text.strip()
        if ignore_leading:
            return text.lstrip()
        if ignore_trailing:
            return text.rstrip()
        return text

    @property
    def original_text(self) -> str:
        '''The text of the tokens joined together, without transformations.'''
        return ''.join(token.original_text for token in self._tokens)

    def __hash__(self):
        if self._hash is None:
            self._hash = hash(self.transformed_text)
        return self._hash

    def __eq__(self, other):
        if not isinstance(other, Line):
            return NotImplemented
        # The cached hashes give a cheap rejection, but equal hashes do not
        # prove equal text, so a hash match is confirmed by comparing the
        # transformed text itself.
        if hash(self) != hash(other):
            return False
        return self.transformed_text == other.transformed_text

    def __str__(self):
        return self.original_text

    def __repr__(self):
        return '{}({!r})'.format(type(self).__name__, self.original_text)
class Token:
    '''
    Base class for a piece of text matched by one of the parser's patterns.

    A token exposes the text exactly as matched (``original_text``) and the
    text to use for comparisons (``transformed_text``).  Subclasses override
    ``_get_transformed_text`` to apply the settings relevant to them; the
    base implementation returns the matched text unchanged.
    '''

    def __init__(self, regex_match, settings: Parser.Settings) -> None:
        self._settings = settings
        self._regex_match = regex_match
        # Filled in on the first access of ``transformed_text``.
        self._transformed_text = None  # type: str
        self._text = self._get_matched_text()

    def _get_matched_text(self) -> str:
        '''Return the full text covered by the regex match.'''
        return self._regex_match.group()

    @property
    def original_text(self) -> str:
        '''The text exactly as it was matched.'''
        return self._text

    @property
    def transformed_text(self) -> str:
        '''The text used for comparisons; computed once, then cached.'''
        cached = self._transformed_text
        if cached is None:
            cached = self._transformed_text = self._get_transformed_text()
        return cached

    def _get_transformed_text(self) -> str:
        '''Return the comparison text; the matched text by default.'''
        return self._text
class WordToken(Token):
    '''
    A WordToken stores a string of non-whitespace characters.
    '''

    def _get_transformed_text(self) -> str:
        '''Return the word, lower-cased when case is being ignored.'''
        return self._text.lower() if self._settings.ignore_case else self._text
class NewlineToken(Token):
    '''
    A NewlineToken stores one or more newline characters.  When blank lines
    are being ignored, the matched text may also contain the spaces and tabs
    found between those newlines.
    '''

    def __init__(self, regex_match, settings: Parser.Settings) -> None:
        super().__init__(regex_match, settings)

    def _get_transformed_text(self) -> str:
        '''Return the newline text to compare, according to the settings.'''
        settings = self._settings
        if settings.ignore_newline_changes:
            # Every newline token compares as a single LF.
            return '\n'
        if not settings.ignore_blank_lines:
            return super()._get_transformed_text()
        # Blank lines are ignored: keep only the first newline sequence of
        # the matched text.
        return re.match(_NEWLINE_CHARS, self._text).group()
class WhitespaceToken(Token):
    '''
    A WhitespaceToken stores one or more non-newline whitespace
    characters (i.e. tabs and spaces).
    '''

    def _get_transformed_text(self) -> str:
        '''Return the whitespace to compare, according to the settings.'''
        settings = self._settings
        if settings.ignore_non_newline_whitespace:
            # Whitespace is ignored entirely.
            return ''
        if not settings.ignore_non_newline_whitespace_changes:
            return super()._get_transformed_text()
        # Any run of spaces/tabs compares as one space.
        return ' '
def token_factory(token_type: str, regex_match, parser_settings) -> Token:
    '''
    Build the token registered under ``token_type``.

    The class is looked up in ``_TOKEN_TYPES`` and called with the regex
    match and the parser settings.  Raises ``KeyError`` when ``token_type``
    is not a key of ``_TOKEN_TYPES``.
    '''
    token_class = _TOKEN_TYPES[token_type]
    return token_class(regex_match, parser_settings)
# Maps a token type name (the named regex group that matched) to the Token
# subclass that token_factory instantiates for it.
_TOKEN_TYPES = {
    'word': WordToken,
    'newline': NewlineToken,
    'whitespace': WhitespaceToken,
}