Commit 9ce40fe

Added TextSlice; Lark can now parse/lex a text-slice
Based on a previous PR by MegaIng

1 parent: 821f3c1
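
In practice the headline feature reads like this, a minimal sketch (the grammar and buffer are illustrative, not part of the commit; note that slices require a standard lexer, since the dynamic Earley lexers reject partial slices, per lark/parser_frontends.py below):

    from lark import Lark, TextSlice

    parser = Lark('start: "a"+', parser='lalr')

    buf = "xxxxxaaaxx"
    tree = parser.parse(TextSlice(buf, 5, 8))  # parse buf[5:8] without copying buf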

File tree

8 files changed: +185 −40 lines changed

lark/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -11,7 +11,7 @@
 from .lark import Lark
 from .lexer import Token
 from .tree import ParseTree, Tree
-from .utils import logger
+from .utils import logger, TextSlice
 from .visitors import Discard, Transformer, Transformer_NonRecursive, Visitor, v_args
 
 __version__: str = "1.2.2"

@@ -33,6 +33,7 @@
     "Discard",
     "Transformer",
     "Transformer_NonRecursive",
+    "TextSlice",
     "Visitor",
     "v_args",
 )
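
With the new export, user code can import the class straight from the package root:

    from lark import TextSlice  # re-exported from lark.utils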

lark/lark.py

Lines changed: 4 additions & 4 deletions

@@ -16,7 +16,7 @@
 from .parser_frontends import ParsingFrontend
 
 from .exceptions import ConfigurationError, assert_config, UnexpectedInput
-from .utils import Serialize, SerializeMemoizer, FS, logger
+from .utils import Serialize, SerializeMemoizer, FS, logger, TextOrSlice
 from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files, PackageResource, sha256_digest
 from .tree import Tree
 from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType

@@ -598,7 +598,7 @@ def __repr__(self):
         return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer)
 
 
-    def lex(self, text: str, dont_ignore: bool=False) -> Iterator[Token]:
+    def lex(self, text: TextOrSlice, dont_ignore: bool=False) -> Iterator[Token]:
         """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='basic'
 
         When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore.

@@ -620,7 +620,7 @@ def get_terminal(self, name: str) -> TerminalDef:
         """Get information about a terminal"""
         return self._terminals_dict[name]
 
-    def parse_interactive(self, text: Optional[str]=None, start: Optional[str]=None) -> 'InteractiveParser':
+    def parse_interactive(self, text: Optional[TextOrSlice]=None, start: Optional[str]=None) -> 'InteractiveParser':
         """Start an interactive parsing session.
 
         Parameters:

@@ -634,7 +634,7 @@ def parse_interactive(self, text: Optional[str]=None, start: Optional[str]=None)
         """
         return self.parser.parse_interactive(text, start=start)
 
-    def parse(self, text: str, start: Optional[str]=None, on_error: 'Optional[Callable[[UnexpectedInput], bool]]'=None) -> 'ParseTree':
+    def parse(self, text: TextOrSlice, start: Optional[str]=None, on_error: 'Optional[Callable[[UnexpectedInput], bool]]'=None) -> 'ParseTree':
         """Parse the given text, according to the options provided.
 
         Parameters:
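
parse_interactive() accepts a TextOrSlice as well. A sketch of the interactive flow with a slice (the grammar and offsets are illustrative; interactive parsing still requires parser='lalr'):

    from lark import Lark, TextSlice

    parser = Lark('start: "a"+', parser='lalr')
    ip = parser.parse_interactive(TextSlice("xxaaayy", 2, 5))
    ip.exhaust_lexer()    # feed every token from the slice into the parser
    tree = ip.feed_eof()  # finish the parse and obtain the tree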

lark/lexer.py

Lines changed: 34 additions & 19 deletions

@@ -17,7 +17,7 @@
 from .common import LexerConf
 from .parsers.lalr_parser_state import ParserState
 
-from .utils import classify, get_regexp_width, Serialize, logger
+from .utils import classify, get_regexp_width, Serialize, logger, TextSlice, TextOrSlice
 from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
 from .grammar import TOKEN_DEFAULT_PRIORITY
 

@@ -289,7 +289,7 @@ def __eq__(self, other):
 
         return self.char_pos == other.char_pos and self.newline_char == other.newline_char
 
-    def feed(self, token: Token, test_newline=True):
+    def feed(self, token: TextOrSlice, test_newline=True):
         """Consume a token and calculate the new line & column.
 
         As an optional optimization, set test_newline=False if token doesn't contain a newline.

@@ -382,9 +382,9 @@ def _build_mres(self, terminals, max_size):
             terminals = terminals[max_size:]
         return mres
 
-    def match(self, text, pos):
+    def match(self, text: TextSlice, pos):
         for mre in self._mres:
-            m = mre.match(text, pos)
+            m = mre.match(text.text, pos, text.end)
             if m:
                 return m.group(0), m.lastgroup
 

@@ -394,6 +394,7 @@ def fullmatch(self, text: str) -> Optional[str]:
             m = mre.fullmatch(text)
             if m:
                 return m.lastgroup
+        return None
 
 def _regexp_has_newline(r: str):
     r"""Expressions that may indicate newlines in a regexp:

@@ -413,20 +414,31 @@ class LexerState:
 
     __slots__ = 'text', 'line_ctr', 'last_token'
 
-    text: str
+    text: TextSlice
     line_ctr: LineCounter
     last_token: Optional[Token]
 
-    def __init__(self, text: str, line_ctr: Optional[LineCounter]=None, last_token: Optional[Token]=None):
+    def __init__(self, text: TextSlice, line_ctr: Optional[LineCounter] = None, last_token: Optional[Token]=None):
+        if line_ctr is None:
+            line_ctr = LineCounter(b'\n' if isinstance(text.text, bytes) else '\n')
+
+        if text.start > 0:
+            # Advance the line-count until line_ctr.char_pos == text.start
+            line_ctr.feed(TextSlice(text.text, 0, text.start))
+
+        if not (text.start <= line_ctr.char_pos <= text.end):
+            raise ValueError("LineCounter.char_pos is out of bounds")
+
         self.text = text
-        self.line_ctr = line_ctr or LineCounter(b'\n' if isinstance(text, bytes) else '\n')
+        self.line_ctr = line_ctr
         self.last_token = last_token
 
+
     def __eq__(self, other):
         if not isinstance(other, LexerState):
             return NotImplemented
 
-        return self.text is other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token
+        return self.text == other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token
 
     def __copy__(self):
         return type(self)(self.text, copy(self.line_ctr), self.last_token)
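
The prefix feed above has a user-visible consequence: when a slice starts mid-text, the skipped prefix is run through the LineCounter first, so the line, column and position of the resulting tokens refer to the underlying string, not to the slice. A sketch (grammar and input are illustrative):

    from lark import Lark, TextSlice

    p = Lark(r"""
        start: WORD+
        WORD: /\w+/
        %ignore " "
        """)

    tok = list(p.lex(TextSlice("junk\nhello world", 5, None)))[0]
    assert (tok, tok.line, tok.column) == ("hello", 2, 1)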
@@ -436,15 +448,18 @@ class LexerThread:
     """A thread that ties a lexer instance and a lexer state, to be used by the parser
     """
 
-    def __init__(self, lexer: 'Lexer', lexer_state: LexerState):
+    def __init__(self, lexer: 'Lexer', lexer_state: Optional[LexerState]):
         self.lexer = lexer
         self.state = lexer_state
 
     @classmethod
-    def from_text(cls, lexer: 'Lexer', text: str) -> 'LexerThread':
+    def from_text(cls, lexer: 'Lexer', text_or_slice: TextOrSlice) -> 'LexerThread':
+        text = TextSlice.cast_from(text_or_slice)
         return cls(lexer, LexerState(text))
 
     def lex(self, parser_state):
+        if self.state is None:
+            raise TypeError("Cannot lex: No text assigned to lexer state")
         return self.lexer.lex(self.state, parser_state)
 
     def __copy__(self):

@@ -465,9 +480,9 @@ class Lexer(ABC):
     def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]:
         return NotImplemented
 
-    def make_lexer_state(self, text):
+    def make_lexer_state(self, text: str):
         "Deprecated"
-        return LexerState(text)
+        return LexerState(TextSlice.cast_from(text))
 
 
 def _check_regex_collisions(terminal_to_regexp: Dict[TerminalDef, str], comparator, strict_mode, max_collisions_to_show=8):

@@ -567,9 +582,9 @@ def __init__(self, conf: 'LexerConf', comparator=None) -> None:
         self.use_bytes = conf.use_bytes
         self.terminals_by_name = conf.terminals_by_name
 
-        self._scanner = None
+        self._scanner: Optional[Scanner] = None
 
-    def _build_scanner(self):
+    def _build_scanner(self) -> Scanner:
         terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes)
         assert all(self.callback.values())
 

@@ -580,26 +595,26 @@
             else:
                 self.callback[type_] = f
 
-        self._scanner = Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes)
+        return Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes)
 
     @property
-    def scanner(self):
+    def scanner(self) -> Scanner:
         if self._scanner is None:
-            self._build_scanner()
+            self._scanner = self._build_scanner()
         return self._scanner
 
     def match(self, text, pos):
         return self.scanner.match(text, pos)
 
     def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
         line_ctr = lex_state.line_ctr
-        while line_ctr.char_pos < len(lex_state.text):
+        while line_ctr.char_pos < lex_state.text.end:
             res = self.match(lex_state.text, line_ctr.char_pos)
             if not res:
                 allowed = self.scanner.allowed_types - self.ignore_types
                 if not allowed:
                     allowed = {"<END-OF-FILE>"}
-                raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
+                raise UnexpectedCharacters(lex_state.text.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
                                            allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token],
                                            state=parser_state, terminals_by_name=self.terminals_by_name)
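
The two-line Scanner.match change above is what makes slice-based lexing cheap: Python's re.Pattern.match accepts pos and endpos arguments that confine matching to a window of the existing string, so no substring copy is ever made. For reference, outside of Lark:

    import re

    pat = re.compile(r"\w+")
    s = "abc def"
    m = pat.match(s, 4, 7)  # match only within s[4:7]; the string is not copied
    assert m.group(0) == "def"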

lark/parser_frontends.py

Lines changed: 31 additions & 12 deletions

@@ -1,7 +1,7 @@
 from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING
 
 from .exceptions import ConfigurationError, GrammarError, assert_config
-from .utils import get_regexp_width, Serialize
+from .utils import get_regexp_width, Serialize, TextOrSlice, TextSlice
 from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer
 from .parsers import earley, xearley, cyk
 from .parsers.lalr_parser import LALR_Parser

@@ -15,16 +15,31 @@
 ###{standalone
 
 def _wrap_lexer(lexer_class):
-    future_interface = getattr(lexer_class, '__future_interface__', False)
-    if future_interface:
+    future_interface = getattr(lexer_class, '__future_interface__', 0)
+    if future_interface == 2:
         return lexer_class
-    else:
-        class CustomLexerWrapper(Lexer):
+    elif future_interface == 1:
+        class CustomLexerWrapper1(Lexer):
+            def __init__(self, lexer_conf):
+                self.lexer = lexer_class(lexer_conf)
+            def lex(self, lexer_state, parser_state):
+                if not lexer_state.text.is_complete_text():
+                    raise TypeError("Interface=1 Custom Lexer don't support TextSlice")
+                lexer_state.text = lexer_state.text
+                return self.lexer.lex(lexer_state, parser_state)
+        return CustomLexerWrapper1
+    elif future_interface == 0:
+        class CustomLexerWrapper0(Lexer):
             def __init__(self, lexer_conf):
                 self.lexer = lexer_class(lexer_conf)
+
             def lex(self, lexer_state, parser_state):
-                return self.lexer.lex(lexer_state.text)
-        return CustomLexerWrapper
+                if not lexer_state.text.is_complete_text():
+                    raise TypeError("Interface=0 Custom Lexer don't support TextSlice")
+                return self.lexer.lex(lexer_state.text.text)
+        return CustomLexerWrapper0
+    else:
+        raise ValueError(f"Unknown __future_interface__ value {future_interface}, integer 0-2 expected")
 
 
 def _deserialize_parsing_frontend(data, memo, lexer_conf, callbacks, options):
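
The wrapper now distinguishes three generations of custom lexers by their __future_interface__ attribute: 2 receives a LexerState whose .text is a TextSlice, 1 receives a LexerState but only over complete text, and 0 (the legacy default) receives a plain string. A hypothetical slice-aware custom lexer under the newest interface might look like this (the WORD token type and the lexer itself are illustrative, not part of the commit):

    from lark.lexer import Lexer, Token

    class WordLexer(Lexer):
        __future_interface__ = 2  # opt in to the LexerState/TextSlice interface

        def __init__(self, lexer_conf):
            pass  # a real lexer would inspect lexer_conf.terminals here

        def lex(self, lexer_state, parser_state):
            ts = lexer_state.text  # a TextSlice
            for word in ts.text[ts.start:ts.end].split():
                yield Token('WORD', word)

    # Usage (hypothetical grammar): Lark(grammar, parser='lalr', lexer=WordLexer)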
@@ -93,23 +108,27 @@ def _verify_start(self, start=None):
             raise ConfigurationError("Unknown start rule %s. Must be one of %r" % (start, self.parser_conf.start))
         return start
 
-    def _make_lexer_thread(self, text: str) -> Union[str, LexerThread]:
+    def _make_lexer_thread(self, text: Optional[TextOrSlice]) -> Union[TextOrSlice, LexerThread, None]:
         cls = (self.options and self.options._plugins.get('LexerThread')) or LexerThread
-        return text if self.skip_lexer else cls.from_text(self.lexer, text)
+        return text if self.skip_lexer else cls(self.lexer, None) if text is None else cls.from_text(self.lexer, text)
+
+    def parse(self, text: Optional[TextOrSlice], start=None, on_error=None):
+        if self.lexer_conf.lexer_type in ("dynamic", "dynamic_complete"):
+            if isinstance(text, TextSlice) and not text.is_complete_text():
+                raise TypeError(f"Lexer {self.lexer_conf.lexer_type} does not support text slices.")
 
-    def parse(self, text: str, start=None, on_error=None):
         chosen_start = self._verify_start(start)
         kw = {} if on_error is None else {'on_error': on_error}
         stream = self._make_lexer_thread(text)
         return self.parser.parse(stream, chosen_start, **kw)
 
-    def parse_interactive(self, text: Optional[str]=None, start=None):
+    def parse_interactive(self, text: Optional[TextOrSlice]=None, start=None):
         # TODO BREAK - Change text from Optional[str] to text: str = ''.
         # Would break behavior of exhaust_lexer(), which currently raises TypeError, and after the change would just return []
         chosen_start = self._verify_start(start)
         if self.parser_conf.parser_type != 'lalr':
             raise ConfigurationError("parse_interactive() currently only works with parser='lalr' ")
-        stream = self._make_lexer_thread(text)  # type: ignore[arg-type]
+        stream = self._make_lexer_thread(text)
         return self.parser.parse_interactive(stream, chosen_start)
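
Note the guard added to parse(): the dynamic Earley lexers scan the entire input, so this commit rejects partial slices up front instead of mis-lexing them. A sketch of the resulting behavior (grammar is illustrative):

    from lark import Lark, TextSlice

    p = Lark('start: "a"+')              # parser='earley', lexer='dynamic' by default
    p.parse("aaa")                       # plain strings behave as before
    p.parse(TextSlice("xaaax", 1, -1))   # raises TypeError: partial slice, dynamic lexer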

lark/parsers/lalr_parser.py

Lines changed: 1 addition & 1 deletion

@@ -55,7 +55,7 @@ def parse(self, lexer, start, on_error=None):
                 if isinstance(e, UnexpectedCharacters):
                     # If user didn't change the character position, then we should
                     if p == s.line_ctr.char_pos:
-                        s.line_ctr.feed(s.text[p:p+1])
+                        s.line_ctr.feed(s.text.text[p:p+1])
 
                 try:
                     return e.interactive_parser.resume_parse()

lark/utils.py

Lines changed: 45 additions & 0 deletions

@@ -7,6 +7,8 @@
 ###{standalone
 import sys, re
 import logging
+from dataclasses import dataclass
+from typing import Generic, AnyStr
 
 logger: logging.Logger = logging.getLogger("lark")
 logger.addHandler(logging.StreamHandler())

@@ -158,6 +160,49 @@ def get_regexp_width(expr: str) -> Union[Tuple[int, int], List[int]]:
     else:
         return 0, int(MAXWIDTH)
 
+
+@dataclass(frozen=True)
+class TextSlice(Generic[AnyStr]):
+    text: AnyStr
+    start: int
+    end: int
+
+    def __post_init__(self):
+        if not isinstance(self.text, (str, bytes)):
+            raise TypeError("text must be str or bytes")
+
+        if self.start < 0:
+            object.__setattr__(self, 'start', self.start + len(self.text))
+        assert self.start >=0
+
+        if self.end is None:
+            object.__setattr__(self, 'end', len(self.text))
+        elif self.end < 0:
+            object.__setattr__(self, 'end', self.end + len(self.text))
+        assert self.end <= len(self.text)
+
+    @classmethod
+    def cast_from(cls, text: 'TextOrSlice') -> 'TextSlice[AnyStr]':
+        if isinstance(text, TextSlice):
+            return text
+
+        return cls(text, 0, len(text))
+
+    def is_complete_text(self):
+        return self.start == 0 and self.end == len(self.text)
+
+    def __len__(self):
+        return self.end - self.start
+
+    def count(self, substr: AnyStr):
+        return self.text.count(substr, self.start, self.end)
+
+    def rindex(self, substr: AnyStr):
+        return self.text.rindex(substr, self.start, self.end)
+
+
+TextOrSlice = Union[AnyStr, 'TextSlice[AnyStr]']
+
 ###}
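
TextSlice is a frozen dataclass, and __post_init__ normalizes Python-style negative indices and an open end, so instances compare by value. A quick sketch of the semantics:

    from lark import TextSlice

    s = TextSlice("hello world", -5, None)  # negative start and open end are normalized
    assert (s.start, s.end) == (6, 11)
    assert len(s) == 5
    assert TextSlice.cast_from("abc") == TextSlice("abc", 0, 3)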

tests/test_lexer.py

Lines changed: 14 additions & 1 deletion

@@ -1,6 +1,7 @@
 from unittest import TestCase, main
 
-from lark import Lark, Tree
+from lark import Lark, Tree, TextSlice
+
 
 class TestLexer(TestCase):
     def setUp(self):

@@ -18,6 +19,18 @@ def test_basic(self):
         res = list(p.lex("abc cba dd", dont_ignore=True))
         assert res == list('abc cba dd')
 
+    def test_subset_lex(self):
+        p = Lark("""
+            start: "a" "b" "c" "d"
+            %ignore " "
+            """)
+
+        res = list(p.lex(TextSlice("xxxabc cba ddxx", 3, -2)))
+        assert res == list('abccbadd')
+
+        res = list(p.lex(TextSlice("aaaabc cba dddd", 3, -2)))
+        assert res == list('abccbadd')
+
 
 if __name__ == '__main__':
     main()
