
Commit 1640da5

Merge pull request #1452 from lark-parser/textslice
Added TextSlice; Lark can now parse/lex a text-slice
2 parents: 9a12577 + c3893d8
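
A minimal usage sketch for orientation (the grammar and offsets are illustrative; the TextSlice(text, start, end) signature and the lexer restriction are taken from the diffs below):

```python
from lark import Lark, TextSlice

# Illustrative grammar. Per the updated Lark.parse docstring, TextSlice
# inputs require lexer='basic' or 'contextual' (the default for LALR).
parser = Lark(r"""
    start: WORD+
    WORD: /\w+/
    %ignore /\s+/
""", parser="lalr")

text = "HEADER: hello world"
# Parse only text[8:], without copying the substring out of the buffer.
tree = parser.parse(TextSlice(text, 8, len(text)))
```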

9 files changed: +254, -60 lines


docs/classes.rst (5 additions, 0 deletions)

@@ -96,3 +96,8 @@ Indenter
 
 .. autoclass:: lark.indenter.Indenter
 .. autoclass:: lark.indenter.PythonIndenter
+
+TextSlice
+---------
+
+.. autoclass:: lark.utils.TextSlice

lark/__init__.py (2 additions, 1 deletion)

@@ -11,7 +11,7 @@
 from .lark import Lark
 from .lexer import Token
 from .tree import ParseTree, Tree
-from .utils import logger
+from .utils import logger, TextSlice
 from .visitors import Discard, Transformer, Transformer_NonRecursive, Visitor, v_args
 
 __version__: str = "1.2.2"
@@ -33,6 +33,7 @@
     "Discard",
     "Transformer",
    "Transformer_NonRecursive",
+    "TextSlice",
     "Visitor",
     "v_args",
 )
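
With the new top-level export, usage like the following sketch becomes possible (cast_from and is_complete_text appear at call sites in the diffs below; their exact semantics are assumed here):

```python
from lark import TextSlice

whole = TextSlice.cast_from("hello world")   # plain str -> full-range slice
part = TextSlice("hello world", 6, 11)       # covers just "world"

assert whole.is_complete_text()
assert not part.is_complete_text()
```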

lark/lark.py (10 additions, 8 deletions)

@@ -16,7 +16,7 @@
     from .parser_frontends import ParsingFrontend
 
 from .exceptions import ConfigurationError, assert_config, UnexpectedInput
-from .utils import Serialize, SerializeMemoizer, FS, logger
+from .utils import Serialize, SerializeMemoizer, FS, logger, TextOrSlice
 from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files, PackageResource, sha256_digest
 from .tree import Tree
 from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType
@@ -598,7 +598,7 @@ def __repr__(self):
         return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer)
 
 
-    def lex(self, text: str, dont_ignore: bool=False) -> Iterator[Token]:
+    def lex(self, text: TextOrSlice, dont_ignore: bool=False) -> Iterator[Token]:
         """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='basic'
 
         When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore.
@@ -620,11 +620,11 @@ def get_terminal(self, name: str) -> TerminalDef:
         """Get information about a terminal"""
         return self._terminals_dict[name]
 
-    def parse_interactive(self, text: Optional[str]=None, start: Optional[str]=None) -> 'InteractiveParser':
-        """Start an interactive parsing session.
+    def parse_interactive(self, text: Optional[TextOrSlice]=None, start: Optional[str]=None) -> 'InteractiveParser':
+        """Start an interactive parsing session. Only works when parser='lalr'.
 
         Parameters:
-            text (str, optional): Text to be parsed. Required for ``resume_parse()``.
+            text (TextOrSlice, optional): Text to be parsed. Required for ``resume_parse()``.
             start (str, optional): Start symbol
 
         Returns:
@@ -634,13 +634,15 @@ def parse_interactive(self, text: Optional[str]=None, start: Optional[str]=None)
         """
         return self.parser.parse_interactive(text, start=start)
 
-    def parse(self, text: str, start: Optional[str]=None, on_error: 'Optional[Callable[[UnexpectedInput], bool]]'=None) -> 'ParseTree':
+    def parse(self, text: TextOrSlice, start: Optional[str]=None, on_error: 'Optional[Callable[[UnexpectedInput], bool]]'=None) -> 'ParseTree':
         """Parse the given text, according to the options provided.
 
         Parameters:
-            text (str): Text to be parsed.
+            text (TextOrSlice): Text to be parsed, as `str` or `bytes`.
+                TextSlice may also be used, but only when lexer='basic' or 'contextual'.
             start (str, optional): Required if Lark was given multiple possible start symbols (using the start option).
-            on_error (function, optional): if provided, will be called on UnexpectedToken error. Return true to resume parsing.
+            on_error (function, optional): if provided, will be called on UnexpectedInput error,
+                with the exception as its argument. Return true to resume parsing, or false to raise the exception.
                 LALR only. See examples/advanced/error_handling.py for an example of how to use on_error.
 
         Returns:
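
A hedged sketch of the clarified on_error contract, modeled on examples/advanced/error_handling.py; the single-character skip during recovery is visible in the lark/parsers/lalr_parser.py hunk at the end of this commit:

```python
from lark import Lark
from lark.exceptions import UnexpectedInput

parser = Lark("start: 'a'+", parser="lalr")

def keep_going(e: UnexpectedInput) -> bool:
    # Receives the exception; True resumes parsing, False re-raises it.
    print("recovering at position", e.pos_in_stream)
    return True

tree = parser.parse("aaXaa", on_error=keep_going)  # the 'X' is skipped
```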

lark/lexer.py (48 additions, 29 deletions)

@@ -17,7 +17,7 @@
     from .common import LexerConf
     from .parsers.lalr_parser_state import ParserState
 
-from .utils import classify, get_regexp_width, Serialize, logger
+from .utils import classify, get_regexp_width, Serialize, logger, TextSlice, TextOrSlice
 from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
 from .grammar import TOKEN_DEFAULT_PRIORITY
 
@@ -289,7 +289,7 @@ def __eq__(self, other):
 
         return self.char_pos == other.char_pos and self.newline_char == other.newline_char
 
-    def feed(self, token: Token, test_newline=True):
+    def feed(self, token: TextOrSlice, test_newline=True):
         """Consume a token and calculate the new line & column.
 
         As an optional optimization, set test_newline=False if token doesn't contain a newline.
@@ -305,13 +305,13 @@ def feed(self, token: Token, test_newline=True):
 
 
 class UnlessCallback:
-    def __init__(self, scanner):
+    def __init__(self, scanner: 'Scanner'):
         self.scanner = scanner
 
-    def __call__(self, t):
-        res = self.scanner.match(t.value, 0)
-        if res:
-            _value, t.type = res
+    def __call__(self, t: Token):
+        res = self.scanner.fullmatch(t.value)
+        if res is not None:
+            t.type = res
         return t
 
 
@@ -347,19 +347,18 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes):
                 if strtok.pattern.flags <= retok.pattern.flags:
                     embedded_strs.add(strtok)
         if unless:
-            callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))
+            callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, use_bytes=use_bytes))
 
     new_terminals = [t for t in terminals if t not in embedded_strs]
     return new_terminals, callback
 
 
 class Scanner:
-    def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False):
+    def __init__(self, terminals, g_regex_flags, re_, use_bytes):
         self.terminals = terminals
         self.g_regex_flags = g_regex_flags
         self.re_ = re_
         self.use_bytes = use_bytes
-        self.match_whole = match_whole
 
         self.allowed_types = {t.name for t in self.terminals}
 
@@ -369,10 +368,9 @@ def _build_mres(self, terminals, max_size):
         # Python sets an unreasonable group limit (currently 100) in its re module
         # Worse, the only way to know we reached it is by catching an AssertionError!
         # This function recursively tries less and less groups until it's successful.
-        postfix = '$' if self.match_whole else ''
         mres = []
         while terminals:
-            pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
+            pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp()) for t in terminals[:max_size])
             if self.use_bytes:
                 pattern = pattern.encode('latin-1')
             try:
@@ -384,13 +382,20 @@
             terminals = terminals[max_size:]
         return mres
 
-    def match(self, text, pos):
+    def match(self, text: TextSlice, pos):
         for mre in self._mres:
-            m = mre.match(text, pos)
+            m = mre.match(text.text, pos, text.end)
             if m:
                 return m.group(0), m.lastgroup
 
 
+    def fullmatch(self, text: str) -> Optional[str]:
+        for mre in self._mres:
+            m = mre.fullmatch(text)
+            if m:
+                return m.lastgroup
+        return None
+
 def _regexp_has_newline(r: str):
     r"""Expressions that may indicate newlines in a regexp:
     - newlines (\n)
@@ -409,20 +414,31 @@ class LexerState:
 
     __slots__ = 'text', 'line_ctr', 'last_token'
 
-    text: str
+    text: TextSlice
     line_ctr: LineCounter
     last_token: Optional[Token]
 
-    def __init__(self, text: str, line_ctr: Optional[LineCounter]=None, last_token: Optional[Token]=None):
+    def __init__(self, text: TextSlice, line_ctr: Optional[LineCounter] = None, last_token: Optional[Token]=None):
+        if line_ctr is None:
+            line_ctr = LineCounter(b'\n' if isinstance(text.text, bytes) else '\n')
+
+        if text.start > 0:
+            # Advance the line-count until line_ctr.char_pos == text.start
+            line_ctr.feed(TextSlice(text.text, 0, text.start))
+
+        if not (text.start <= line_ctr.char_pos <= text.end):
+            raise ValueError("LineCounter.char_pos is out of bounds")
+
         self.text = text
-        self.line_ctr = line_ctr or LineCounter(b'\n' if isinstance(text, bytes) else '\n')
+        self.line_ctr = line_ctr
         self.last_token = last_token
 
+
     def __eq__(self, other):
         if not isinstance(other, LexerState):
             return NotImplemented
 
-        return self.text is other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token
+        return self.text == other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token
 
     def __copy__(self):
         return type(self)(self.text, copy(self.line_ctr), self.last_token)
@@ -432,15 +448,18 @@ class LexerThread:
     """A thread that ties a lexer instance and a lexer state, to be used by the parser
     """
 
-    def __init__(self, lexer: 'Lexer', lexer_state: LexerState):
+    def __init__(self, lexer: 'Lexer', lexer_state: Optional[LexerState]):
         self.lexer = lexer
         self.state = lexer_state
 
     @classmethod
-    def from_text(cls, lexer: 'Lexer', text: str) -> 'LexerThread':
+    def from_text(cls, lexer: 'Lexer', text_or_slice: TextOrSlice) -> 'LexerThread':
+        text = TextSlice.cast_from(text_or_slice)
         return cls(lexer, LexerState(text))
 
     def lex(self, parser_state):
+        if self.state is None:
+            raise TypeError("Cannot lex: No text assigned to lexer state")
         return self.lexer.lex(self.state, parser_state)
 
     def __copy__(self):
@@ -461,9 +480,9 @@ class Lexer(ABC):
     def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]:
         return NotImplemented
 
-    def make_lexer_state(self, text):
+    def make_lexer_state(self, text: str):
         "Deprecated"
-        return LexerState(text)
+        return LexerState(TextSlice.cast_from(text))
 
 
 def _check_regex_collisions(terminal_to_regexp: Dict[TerminalDef, str], comparator, strict_mode, max_collisions_to_show=8):
@@ -563,9 +582,9 @@ def __init__(self, conf: 'LexerConf', comparator=None) -> None:
         self.use_bytes = conf.use_bytes
         self.terminals_by_name = conf.terminals_by_name
 
-        self._scanner = None
+        self._scanner: Optional[Scanner] = None
 
-    def _build_scanner(self):
+    def _build_scanner(self) -> Scanner:
         terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes)
         assert all(self.callback.values())
 
@@ -576,26 +595,26 @@ def _build_scanner(self):
             else:
                 self.callback[type_] = f
 
-        self._scanner = Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes)
+        return Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes)
 
     @property
-    def scanner(self):
+    def scanner(self) -> Scanner:
         if self._scanner is None:
-            self._build_scanner()
+            self._scanner = self._build_scanner()
         return self._scanner
 
     def match(self, text, pos):
         return self.scanner.match(text, pos)
 
     def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
         line_ctr = lex_state.line_ctr
-        while line_ctr.char_pos < len(lex_state.text):
+        while line_ctr.char_pos < lex_state.text.end:
             res = self.match(lex_state.text, line_ctr.char_pos)
             if not res:
                 allowed = self.scanner.allowed_types - self.ignore_types
                 if not allowed:
                     allowed = {"<END-OF-FILE>"}
-                raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
+                raise UnexpectedCharacters(lex_state.text.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
                                            allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token],
                                            state=parser_state, terminals_by_name=self.terminals_by_name)

lark/parser_frontends.py (31 additions, 12 deletions)

@@ -1,7 +1,7 @@
 from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING
 
 from .exceptions import ConfigurationError, GrammarError, assert_config
-from .utils import get_regexp_width, Serialize
+from .utils import get_regexp_width, Serialize, TextOrSlice, TextSlice
 from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer
 from .parsers import earley, xearley, cyk
 from .parsers.lalr_parser import LALR_Parser
@@ -15,16 +15,31 @@
 ###{standalone
 
 def _wrap_lexer(lexer_class):
-    future_interface = getattr(lexer_class, '__future_interface__', False)
-    if future_interface:
+    future_interface = getattr(lexer_class, '__future_interface__', 0)
+    if future_interface == 2:
         return lexer_class
-    else:
-        class CustomLexerWrapper(Lexer):
+    elif future_interface == 1:
+        class CustomLexerWrapper1(Lexer):
+            def __init__(self, lexer_conf):
+                self.lexer = lexer_class(lexer_conf)
+            def lex(self, lexer_state, parser_state):
+                if not lexer_state.text.is_complete_text():
+                    raise TypeError("Interface=1 Custom Lexer don't support TextSlice")
+                lexer_state.text = lexer_state.text
+                return self.lexer.lex(lexer_state, parser_state)
+        return CustomLexerWrapper1
+    elif future_interface == 0:
+        class CustomLexerWrapper0(Lexer):
             def __init__(self, lexer_conf):
                 self.lexer = lexer_class(lexer_conf)
+
             def lex(self, lexer_state, parser_state):
-                return self.lexer.lex(lexer_state.text)
-        return CustomLexerWrapper
+                if not lexer_state.text.is_complete_text():
+                    raise TypeError("Interface=0 Custom Lexer don't support TextSlice")
+                return self.lexer.lex(lexer_state.text.text)
+        return CustomLexerWrapper0
+    else:
+        raise ValueError(f"Unknown __future_interface__ value {future_interface}, integer 0-2 expected")
 
 
 def _deserialize_parsing_frontend(data, memo, lexer_conf, callbacks, options):
@@ -93,23 +108,27 @@ def _verify_start(self, start=None):
             raise ConfigurationError("Unknown start rule %s. Must be one of %r" % (start, self.parser_conf.start))
         return start
 
-    def _make_lexer_thread(self, text: str) -> Union[str, LexerThread]:
+    def _make_lexer_thread(self, text: Optional[TextOrSlice]) -> Union[TextOrSlice, LexerThread, None]:
         cls = (self.options and self.options._plugins.get('LexerThread')) or LexerThread
-        return text if self.skip_lexer else cls.from_text(self.lexer, text)
+        return text if self.skip_lexer else cls(self.lexer, None) if text is None else cls.from_text(self.lexer, text)
+
+    def parse(self, text: Optional[TextOrSlice], start=None, on_error=None):
+        if self.lexer_conf.lexer_type in ("dynamic", "dynamic_complete"):
+            if isinstance(text, TextSlice) and not text.is_complete_text():
+                raise TypeError(f"Lexer {self.lexer_conf.lexer_type} does not support text slices.")
 
-    def parse(self, text: str, start=None, on_error=None):
         chosen_start = self._verify_start(start)
         kw = {} if on_error is None else {'on_error': on_error}
         stream = self._make_lexer_thread(text)
         return self.parser.parse(stream, chosen_start, **kw)
 
-    def parse_interactive(self, text: Optional[str]=None, start=None):
+    def parse_interactive(self, text: Optional[TextOrSlice]=None, start=None):
         # TODO BREAK - Change text from Optional[str] to text: str = ''.
         # Would break behavior of exhaust_lexer(), which currently raises TypeError, and after the change would just return []
         chosen_start = self._verify_start(start)
         if self.parser_conf.parser_type != 'lalr':
             raise ConfigurationError("parse_interactive() currently only works with parser='lalr' ")
-        stream = self._make_lexer_thread(text)  # type: ignore[arg-type]
+        stream = self._make_lexer_thread(text)
         return self.parser.parse_interactive(stream, chosen_start)
 
 
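_wrap_lexer now distinguishes three custom-lexer protocol versions via __future_interface__: 0 (legacy, lex(text) is called with the raw string), 1 (lex(lexer_state, parser_state), but without TextSlice support), and 2 (the new TextSlice-aware interface, used unwrapped). A hypothetical interface-2 lexer might look like this sketch:

```python
from lark.lexer import Lexer, Token

class CharLexer(Lexer):
    __future_interface__ = 2  # opt into the TextSlice-aware interface

    def __init__(self, lexer_conf):
        pass  # a real lexer would inspect lexer_conf.terminals here

    def lex(self, lexer_state, parser_state):
        text = lexer_state.text  # a TextSlice with .text, .start, .end
        for i in range(text.start, text.end):
            yield Token("CHAR", text.text[i], start_pos=i)
```

Such a class would presumably be passed as Lark(grammar, parser='lalr', lexer=CharLexer), assuming the grammar declares a CHAR terminal.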
lark/parsers/lalr_parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def parse(self, lexer, start, on_error=None):
5555
if isinstance(e, UnexpectedCharacters):
5656
# If user didn't change the character position, then we should
5757
if p == s.line_ctr.char_pos:
58-
s.line_ctr.feed(s.text[p:p+1])
58+
s.line_ctr.feed(s.text.text[p:p+1])
5959

6060
try:
6161
return e.interactive_parser.resume_parse()

0 commit comments
