
Commit 1640da5

Merge pull request #1452 from lark-parser/textslice
Added TextSlice; Lark can now parse/lex a text-slice
2 parents: 9a12577 + c3893d8
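
A minimal usage sketch for orientation (the grammar and offsets are illustrative; the TextSlice(text, start, end) signature and the lexer restriction are taken from the diffs below):

```python
from lark import Lark, TextSlice

# Illustrative grammar. Per the updated Lark.parse docstring, TextSlice
# inputs require lexer='basic' or 'contextual' (the default for LALR).
parser = Lark(r"""
    start: WORD+
    WORD: /\w+/
    %ignore /\s+/
""", parser="lalr")

text = "HEADER: hello world"
# Parse only text[8:], without copying the substring out of the buffer.
tree = parser.parse(TextSlice(text, 8, len(text)))
```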

9 files changed: +254, -60 lines


docs/classes.rst (5 additions, 0 deletions)

@@ -96,3 +96,8 @@ Indenter
 
 .. autoclass:: lark.indenter.Indenter
 .. autoclass:: lark.indenter.PythonIndenter
+
+TextSlice
+---------
+
+.. autoclass:: lark.utils.TextSlice

lark/__init__.py (2 additions, 1 deletion)

@@ -11,7 +11,7 @@
 from .lark import Lark
 from .lexer import Token
 from .tree import ParseTree, Tree
-from .utils import logger
+from .utils import logger, TextSlice
 from .visitors import Discard, Transformer, Transformer_NonRecursive, Visitor, v_args
 
 __version__: str = "1.2.2"
@@ -33,6 +33,7 @@
     "Discard",
     "Transformer",
    "Transformer_NonRecursive",
+    "TextSlice",
     "Visitor",
     "v_args",
 )
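
With the new top-level export, usage like the following sketch becomes possible (cast_from and is_complete_text appear at call sites in the diffs below; their exact semantics are assumed here):

```python
from lark import TextSlice

whole = TextSlice.cast_from("hello world")   # plain str -> full-range slice
part = TextSlice("hello world", 6, 11)       # covers just "world"

assert whole.is_complete_text()
assert not part.is_complete_text()
```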

lark/lark.py (10 additions, 8 deletions)

@@ -16,7 +16,7 @@
     from .parser_frontends import ParsingFrontend
 
 from .exceptions import ConfigurationError, assert_config, UnexpectedInput
-from .utils import Serialize, SerializeMemoizer, FS, logger
+from .utils import Serialize, SerializeMemoizer, FS, logger, TextOrSlice
 from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files, PackageResource, sha256_digest
 from .tree import Tree
 from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType
@@ -598,7 +598,7 @@ def __repr__(self):
         return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer)
 
 
-    def lex(self, text: str, dont_ignore: bool=False) -> Iterator[Token]:
+    def lex(self, text: TextOrSlice, dont_ignore: bool=False) -> Iterator[Token]:
         """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='basic'
 
         When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore.
@@ -620,11 +620,11 @@ def get_terminal(self, name: str) -> TerminalDef:
         """Get information about a terminal"""
         return self._terminals_dict[name]
 
-    def parse_interactive(self, text: Optional[str]=None, start: Optional[str]=None) -> 'InteractiveParser':
-        """Start an interactive parsing session.
+    def parse_interactive(self, text: Optional[TextOrSlice]=None, start: Optional[str]=None) -> 'InteractiveParser':
+        """Start an interactive parsing session. Only works when parser='lalr'.
 
         Parameters:
-            text (str, optional): Text to be parsed. Required for ``resume_parse()``.
+            text (TextOrSlice, optional): Text to be parsed. Required for ``resume_parse()``.
             start (str, optional): Start symbol
 
         Returns:
@@ -634,13 +634,15 @@ def parse_interactive(self, text: Optional[str]=None, start: Optional[str]=None)
         """
         return self.parser.parse_interactive(text, start=start)
 
-    def parse(self, text: str, start: Optional[str]=None, on_error: 'Optional[Callable[[UnexpectedInput], bool]]'=None) -> 'ParseTree':
+    def parse(self, text: TextOrSlice, start: Optional[str]=None, on_error: 'Optional[Callable[[UnexpectedInput], bool]]'=None) -> 'ParseTree':
         """Parse the given text, according to the options provided.
 
         Parameters:
-            text (str): Text to be parsed.
+            text (TextOrSlice): Text to be parsed, as `str` or `bytes`.
+                TextSlice may also be used, but only when lexer='basic' or 'contextual'.
             start (str, optional): Required if Lark was given multiple possible start symbols (using the start option).
-            on_error (function, optional): if provided, will be called on UnexpectedToken error. Return true to resume parsing.
+            on_error (function, optional): if provided, will be called on UnexpectedInput error,
+                with the exception as its argument. Return true to resume parsing, or false to raise the exception.
                 LALR only. See examples/advanced/error_handling.py for an example of how to use on_error.
 
         Returns:
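
A hedged sketch of the clarified on_error contract, modeled on examples/advanced/error_handling.py; the single-character skip during recovery is visible in the lark/parsers/lalr_parser.py hunk at the end of this commit:

```python
from lark import Lark
from lark.exceptions import UnexpectedInput

parser = Lark("start: 'a'+", parser="lalr")

def keep_going(e: UnexpectedInput) -> bool:
    # Receives the exception; True resumes parsing, False re-raises it.
    print("recovering at position", e.pos_in_stream)
    return True

tree = parser.parse("aaXaa", on_error=keep_going)  # the 'X' is skipped
```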

lark/lexer.py (48 additions, 29 deletions)

@@ -17,7 +17,7 @@
     from .common import LexerConf
     from .parsers.lalr_parser_state import ParserState
 
-from .utils import classify, get_regexp_width, Serialize, logger
+from .utils import classify, get_regexp_width, Serialize, logger, TextSlice, TextOrSlice
 from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
 from .grammar import TOKEN_DEFAULT_PRIORITY
 
@@ -289,7 +289,7 @@ def __eq__(self, other):
 
         return self.char_pos == other.char_pos and self.newline_char == other.newline_char
 
-    def feed(self, token: Token, test_newline=True):
+    def feed(self, token: TextOrSlice, test_newline=True):
         """Consume a token and calculate the new line & column.
 
         As an optional optimization, set test_newline=False if token doesn't contain a newline.
@@ -305,13 +305,13 @@ def feed(self, token: Token, test_newline=True):
 
 
 class UnlessCallback:
-    def __init__(self, scanner):
+    def __init__(self, scanner: 'Scanner'):
         self.scanner = scanner
 
-    def __call__(self, t):
-        res = self.scanner.match(t.value, 0)
-        if res:
-            _value, t.type = res
+    def __call__(self, t: Token):
+        res = self.scanner.fullmatch(t.value)
+        if res is not None:
+            t.type = res
         return t
 
 
@@ -347,19 +347,18 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes):
                 if strtok.pattern.flags <= retok.pattern.flags:
                     embedded_strs.add(strtok)
         if unless:
-            callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))
+            callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, use_bytes=use_bytes))
 
     new_terminals = [t for t in terminals if t not in embedded_strs]
     return new_terminals, callback
 
 
 class Scanner:
-    def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False):
+    def __init__(self, terminals, g_regex_flags, re_, use_bytes):
         self.terminals = terminals
         self.g_regex_flags = g_regex_flags
         self.re_ = re_
         self.use_bytes = use_bytes
-        self.match_whole = match_whole
 
         self.allowed_types = {t.name for t in self.terminals}
 
@@ -369,10 +368,9 @@ def _build_mres(self, terminals, max_size):
         # Python sets an unreasonable group limit (currently 100) in its re module
         # Worse, the only way to know we reached it is by catching an AssertionError!
         # This function recursively tries less and less groups until it's successful.
-        postfix = '$' if self.match_whole else ''
         mres = []
         while terminals:
-            pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
+            pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp()) for t in terminals[:max_size])
             if self.use_bytes:
                 pattern = pattern.encode('latin-1')
             try:
@@ -384,13 +382,20 @@
             terminals = terminals[max_size:]
         return mres
 
-    def match(self, text, pos):
+    def match(self, text: TextSlice, pos):
         for mre in self._mres:
-            m = mre.match(text, pos)
+            m = mre.match(text.text, pos, text.end)
             if m:
                 return m.group(0), m.lastgroup
 
 
+    def fullmatch(self, text: str) -> Optional[str]:
+        for mre in self._mres:
+            m = mre.fullmatch(text)
+            if m:
+                return m.lastgroup
+        return None
+
 def _regexp_has_newline(r: str):
     r"""Expressions that may indicate newlines in a regexp:
     - newlines (\n)
@@ -409,20 +414,31 @@ class LexerState:
 
     __slots__ = 'text', 'line_ctr', 'last_token'
 
-    text: str
+    text: TextSlice
     line_ctr: LineCounter
     last_token: Optional[Token]
 
-    def __init__(self, text: str, line_ctr: Optional[LineCounter]=None, last_token: Optional[Token]=None):
+    def __init__(self, text: TextSlice, line_ctr: Optional[LineCounter] = None, last_token: Optional[Token]=None):
+        if line_ctr is None:
+            line_ctr = LineCounter(b'\n' if isinstance(text.text, bytes) else '\n')
+
+        if text.start > 0:
+            # Advance the line-count until line_ctr.char_pos == text.start
+            line_ctr.feed(TextSlice(text.text, 0, text.start))
+
+        if not (text.start <= line_ctr.char_pos <= text.end):
+            raise ValueError("LineCounter.char_pos is out of bounds")
+
         self.text = text
-        self.line_ctr = line_ctr or LineCounter(b'\n' if isinstance(text, bytes) else '\n')
+        self.line_ctr = line_ctr
         self.last_token = last_token
 
+
     def __eq__(self, other):
         if not isinstance(other, LexerState):
             return NotImplemented
 
-        return self.text is other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token
+        return self.text == other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token
 
     def __copy__(self):
         return type(self)(self.text, copy(self.line_ctr), self.last_token)
@@ -432,15 +448,18 @@ class LexerThread:
     """A thread that ties a lexer instance and a lexer state, to be used by the parser
     """
 
-    def __init__(self, lexer: 'Lexer', lexer_state: LexerState):
+    def __init__(self, lexer: 'Lexer', lexer_state: Optional[LexerState]):
         self.lexer = lexer
         self.state = lexer_state
 
     @classmethod
-    def from_text(cls, lexer: 'Lexer', text: str) -> 'LexerThread':
+    def from_text(cls, lexer: 'Lexer', text_or_slice: TextOrSlice) -> 'LexerThread':
+        text = TextSlice.cast_from(text_or_slice)
         return cls(lexer, LexerState(text))
 
     def lex(self, parser_state):
+        if self.state is None:
+            raise TypeError("Cannot lex: No text assigned to lexer state")
         return self.lexer.lex(self.state, parser_state)
 
     def __copy__(self):
@@ -461,9 +480,9 @@ class Lexer(ABC):
     def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]:
         return NotImplemented
 
-    def make_lexer_state(self, text):
+    def make_lexer_state(self, text: str):
         "Deprecated"
-        return LexerState(text)
+        return LexerState(TextSlice.cast_from(text))
 
 
 def _check_regex_collisions(terminal_to_regexp: Dict[TerminalDef, str], comparator, strict_mode, max_collisions_to_show=8):
@@ -563,9 +582,9 @@ def __init__(self, conf: 'LexerConf', comparator=None) -> None:
         self.use_bytes = conf.use_bytes
         self.terminals_by_name = conf.terminals_by_name
 
-        self._scanner = None
+        self._scanner: Optional[Scanner] = None
 
-    def _build_scanner(self):
+    def _build_scanner(self) -> Scanner:
         terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes)
         assert all(self.callback.values())
 
@@ -576,26 +595,26 @@ def _build_scanner(self):
             else:
                 self.callback[type_] = f
 
-        self._scanner = Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes)
+        return Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes)
 
     @property
-    def scanner(self):
+    def scanner(self) -> Scanner:
         if self._scanner is None:
-            self._build_scanner()
+            self._scanner = self._build_scanner()
         return self._scanner
 
     def match(self, text, pos):
         return self.scanner.match(text, pos)
 
     def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
         line_ctr = lex_state.line_ctr
-        while line_ctr.char_pos < len(lex_state.text):
+        while line_ctr.char_pos < lex_state.text.end:
             res = self.match(lex_state.text, line_ctr.char_pos)
             if not res:
                 allowed = self.scanner.allowed_types - self.ignore_types
                 if not allowed:
                     allowed = {"<END-OF-FILE>"}
-                raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
+                raise UnexpectedCharacters(lex_state.text.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
                                            allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token],
                                            state=parser_state, terminals_by_name=self.terminals_by_name)

lark/parser_frontends.py (31 additions, 12 deletions)

@@ -1,7 +1,7 @@
 from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING
 
 from .exceptions import ConfigurationError, GrammarError, assert_config
-from .utils import get_regexp_width, Serialize
+from .utils import get_regexp_width, Serialize, TextOrSlice, TextSlice
 from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer
 from .parsers import earley, xearley, cyk
 from .parsers.lalr_parser import LALR_Parser
@@ -15,16 +15,31 @@
 ###{standalone
 
 def _wrap_lexer(lexer_class):
-    future_interface = getattr(lexer_class, '__future_interface__', False)
-    if future_interface:
+    future_interface = getattr(lexer_class, '__future_interface__', 0)
+    if future_interface == 2:
         return lexer_class
-    else:
-        class CustomLexerWrapper(Lexer):
+    elif future_interface == 1:
+        class CustomLexerWrapper1(Lexer):
+            def __init__(self, lexer_conf):
+                self.lexer = lexer_class(lexer_conf)
+            def lex(self, lexer_state, parser_state):
+                if not lexer_state.text.is_complete_text():
+                    raise TypeError("Interface=1 Custom Lexer don't support TextSlice")
+                lexer_state.text = lexer_state.text
+                return self.lexer.lex(lexer_state, parser_state)
+        return CustomLexerWrapper1
+    elif future_interface == 0:
+        class CustomLexerWrapper0(Lexer):
             def __init__(self, lexer_conf):
                 self.lexer = lexer_class(lexer_conf)
+
             def lex(self, lexer_state, parser_state):
-                return self.lexer.lex(lexer_state.text)
-        return CustomLexerWrapper
+                if not lexer_state.text.is_complete_text():
+                    raise TypeError("Interface=0 Custom Lexer don't support TextSlice")
+                return self.lexer.lex(lexer_state.text.text)
+        return CustomLexerWrapper0
+    else:
+        raise ValueError(f"Unknown __future_interface__ value {future_interface}, integer 0-2 expected")
 
 
 def _deserialize_parsing_frontend(data, memo, lexer_conf, callbacks, options):
@@ -93,23 +108,27 @@ def _verify_start(self, start=None):
             raise ConfigurationError("Unknown start rule %s. Must be one of %r" % (start, self.parser_conf.start))
         return start
 
-    def _make_lexer_thread(self, text: str) -> Union[str, LexerThread]:
+    def _make_lexer_thread(self, text: Optional[TextOrSlice]) -> Union[TextOrSlice, LexerThread, None]:
         cls = (self.options and self.options._plugins.get('LexerThread')) or LexerThread
-        return text if self.skip_lexer else cls.from_text(self.lexer, text)
+        return text if self.skip_lexer else cls(self.lexer, None) if text is None else cls.from_text(self.lexer, text)
+
+    def parse(self, text: Optional[TextOrSlice], start=None, on_error=None):
+        if self.lexer_conf.lexer_type in ("dynamic", "dynamic_complete"):
+            if isinstance(text, TextSlice) and not text.is_complete_text():
+                raise TypeError(f"Lexer {self.lexer_conf.lexer_type} does not support text slices.")
 
-    def parse(self, text: str, start=None, on_error=None):
         chosen_start = self._verify_start(start)
         kw = {} if on_error is None else {'on_error': on_error}
         stream = self._make_lexer_thread(text)
         return self.parser.parse(stream, chosen_start, **kw)
 
-    def parse_interactive(self, text: Optional[str]=None, start=None):
+    def parse_interactive(self, text: Optional[TextOrSlice]=None, start=None):
         # TODO BREAK - Change text from Optional[str] to text: str = ''.
         # Would break behavior of exhaust_lexer(), which currently raises TypeError, and after the change would just return []
         chosen_start = self._verify_start(start)
         if self.parser_conf.parser_type != 'lalr':
             raise ConfigurationError("parse_interactive() currently only works with parser='lalr' ")
-        stream = self._make_lexer_thread(text)  # type: ignore[arg-type]
+        stream = self._make_lexer_thread(text)
         return self.parser.parse_interactive(stream, chosen_start)
 
 
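_wrap_lexer now distinguishes three custom-lexer protocol versions via __future_interface__: 0 (legacy, lex(text) is called with the raw string), 1 (lex(lexer_state, parser_state), but without TextSlice support), and 2 (the new TextSlice-aware interface, used unwrapped). A hypothetical interface-2 lexer might look like this sketch:

```python
from lark.lexer import Lexer, Token

class CharLexer(Lexer):
    __future_interface__ = 2  # opt into the TextSlice-aware interface

    def __init__(self, lexer_conf):
        pass  # a real lexer would inspect lexer_conf.terminals here

    def lex(self, lexer_state, parser_state):
        text = lexer_state.text  # a TextSlice with .text, .start, .end
        for i in range(text.start, text.end):
            yield Token("CHAR", text.text[i], start_pos=i)
```

Such a class would presumably be passed as Lark(grammar, parser='lalr', lexer=CharLexer), assuming the grammar declares a CHAR terminal.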
lark/parsers/lalr_parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def parse(self, lexer, start, on_error=None):
5555
if isinstance(e, UnexpectedCharacters):
5656
# If user didn't change the character position, then we should
5757
if p == s.line_ctr.char_pos:
58-
s.line_ctr.feed(s.text[p:p+1])
58+
s.line_ctr.feed(s.text.text[p:p+1])
5959

6060
try:
6161
return e.interactive_parser.resume_parse()

0 commit comments
