Skip to content

Commit bab7bd4

Browse files
committed
Now using unicodedb and graphemes to print errors properly!, now parsing \whitespaces+ properly
1 parent 0c13369 commit bab7bd4

File tree

4 files changed

+107
-53
lines changed

4 files changed

+107
-53
lines changed

kdl.nimble

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ skipFiles = @["src/kdl/query.nim", "src/kdl/schema.nim"]
1010
# Dependencies
1111

1212
requires "nim >= 1.6.0"
13+
# For proper unicode handling when printing errors
14+
requires "graphemes == 0.12.0"
15+
requires "unicodedb == 0.13.0"
1316

1417
task docs, "Generate documentation":
1518
# We create the prefs module documentation separately because it is not imported in the main kdl file, as it's not backend:js friendly

src/kdl/lexer.nim

Lines changed: 50 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,6 @@ type
5454
const
5555
nonIdenChars = {'\\', '/', '(', ')', '{', '}', '<', '>', ';', '[', ']', '=', ',', '"'}
5656
nonInitialChars = Digits + nonIdenChars
57-
whitespaces = [
58-
0x0009, 0x0020, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005,
59-
0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202F, 0x205F, 0x3000,
60-
]
6157
equals = [0x003D, 0xFE66, 0xFF1D, 0x1F7F0]
6258
litMatches = {
6359
"*": tkStar,
@@ -84,9 +80,9 @@ const
8480
proc `$`*(lexer: Lexer): string =
8581
result =
8682
if lexer.isStream:
87-
&"{(if lexer.stream.atEnd: \"SUCCESS\" else: \"FAIL\")}\n\t"
83+
&"{(if lexer.stream.atEnd: \"SUCCESS\" else: \"FAIL\")}\n "
8884
else:
89-
&"{(if lexer.current == lexer.source.len: \"SUCCESS\" else: \"FAIL\")} {lexer.current}/{lexer.source.len}\n\t"
85+
&"{(if lexer.current == lexer.source.len: \"SUCCESS\" else: \"FAIL\")} {lexer.current}/{lexer.source.len}\n "
9086
9187
for token in lexer.stack:
9288
result.add &"({token.kind})"
@@ -257,12 +253,51 @@ proc disallowedRunes() {.lexing: tkEmpty.} =
257253
elif isDisallowedRune(r):
258254
lexer.error &"The code point U+{r.toHex(4)} isn't allowed on a KDL document"
259255
256+
proc tokenMultiLineComment*() {.lexing: tkEmpty.} =
257+
if not lexer.peek("/*"):
258+
return
259+
260+
lexer.inc 2
261+
262+
var nested = 1
263+
264+
while not lexer.eof() and nested > 0:
265+
if lexer.peek("*/"):
266+
dec nested
267+
lexer.inc 2
268+
elif lexer.peek("/*"):
269+
inc nested
270+
lexer.inc 2
271+
else:
272+
inc lexer
273+
274+
if nested > 0:
275+
lexer.error "Expected end of multi-line comment"
276+
260277
proc tokenNewLine*() {.lexing: tkNewLine.} =
261278
for nl in newLines:
262279
if lexer.peek(nl):
263280
lexer.inc nl.len
264281
break
265282
283+
proc tokenWhitespace*() {.lexing: tkWhitespace.} =
284+
## This treats multi-line comments as whitespace
285+
if not lexer.eof() and (let rune = lexer.peekRune(); rune.int in whitespaces):
286+
lexer.inc rune.size
287+
else:
288+
lexer.tokenMultiLineComment()
289+
290+
proc skipWhitespaceOrNewline*() {.lexing: tkEmpty.} =
291+
if not lexer.eof():
292+
if (let rune = lexer.peekRune(); rune.int in whitespaces):
293+
lexer.inc rune.size
294+
else:
295+
lexer.tokenNewLine(addToStack = false)
296+
297+
proc skipWhitespaces*() {.lexing: tkEmpty.} =
298+
while lexer.tokenWhitespace(addToStack = addToStack, consume = consume):
299+
discard
300+
266301
proc tokenNumWhole() {.lexing: tkEmpty.} =
267302
if lexer.peek() in {'-', '+'}:
268303
inc lexer
@@ -368,6 +403,7 @@ proc tokenStringBody(lexer: var Lexer, raw = false) =
368403
369404
while not lexer.eof():
370405
lexer.disallowedRunes()
406+
371407
let before = lexer.getPos()
372408
if lexer.tokenNewLine(addToStack = false):
373409
lexer.multilineStringsNewLines.add((before, lexer.getPos() - before))
@@ -380,11 +416,18 @@ proc tokenStringBody(lexer: var Lexer, raw = false) =
380416
inc lexer
381417
continue
382418
419+
lexer.inc
420+
421+
if lexer.skipWhitespaceOrNewline():
422+
while lexer.skipWhitespaceOrNewline():
423+
discard
424+
continue
425+
383426
let next = lexer.peek(1)
384427
if next notin escapeTable and next != 'u':
385428
lexer.error &"Invalid escape '{next}'"
386429
387-
lexer.inc 2
430+
lexer.inc
388431
389432
if next == 'u':
390433
if lexer.peek() != '{':
@@ -419,37 +462,6 @@ proc tokenString*() {.lexing: tkString.} =
419462
proc tokenRawString*() {.lexing: tkRawString.} =
420463
lexer.tokenStringBody(raw = true)
421464
422-
proc tokenMultiLineComment*() {.lexing: tkEmpty.} =
423-
if not lexer.peek("/*"):
424-
return
425-
426-
lexer.inc 2
427-
428-
var nested = 1
429-
430-
while not lexer.eof() and nested > 0:
431-
if lexer.peek("*/"):
432-
dec nested
433-
lexer.inc 2
434-
elif lexer.peek("/*"):
435-
inc nested
436-
lexer.inc 2
437-
else:
438-
inc lexer
439-
440-
if nested > 0:
441-
lexer.error "Expected end of multi-line comment"
442-
443-
proc tokenWhitespace*() {.lexing: tkWhitespace.} =
444-
if not lexer.eof() and (let rune = lexer.peekRune(); rune.int in whitespaces):
445-
lexer.inc rune.size
446-
else:
447-
lexer.tokenMultiLineComment()
448-
449-
proc skipWhitespaces*() {.lexing: tkEmpty.} =
450-
while lexer.tokenWhitespace():
451-
discard
452-
453465
proc tokenIdent*() {.lexing: tkIdent.} =
454466
if lexer.eof() or lexer.peek() in nonInitialChars:
455467
return

src/kdl/parser.nim

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -168,22 +168,45 @@ proc parseNumber(token: Token): KdlVal =
168168
result = initKFloat()
169169
result.fnum = token.lexeme.parseFloat()
170170
171+
proc continuesWithNewLine(s: string, at: var int, consume = true): bool =
172+
## Checks if there's a new line in `s` at index `at` and increments `at` by the length
173+
## of the new line if consume is true
174+
for nl in newLines:
175+
if s.continuesWith(nl, at):
176+
if consume:
177+
at.inc nl.len
178+
return true
179+
180+
proc continuesWithWhitespace(s: string, at: var int, consume = true): bool =
181+
## Checks if there's a whitespace in `s` at index `at` and increments `at` by the length
182+
## of the whitespace if consume is true
183+
for w in whitespaces:
184+
if s.continuesWith($Rune(w), at):
185+
if consume:
186+
at.inc w.Rune.size
187+
return true
188+
171189
proc escapeString(str: string, x = 0 .. str.high): string =
172190
var i = x.a
173191
while i <= x.b:
174192
if str[i] == '\\':
175193
inc i # Consume backslash
176-
if str[i] == 'u':
194+
195+
if str.continuesWithNewLine(i) or str.continuesWithWhitespace(i):
196+
while str.continuesWithNewLine(i) or str.continuesWithWhitespace(i):
197+
discard
198+
elif str[i] == 'u':
177199
inc i, 2 # Consume u and opening {
178200
var hex: string
179201
inc i, str.parseWhile(hex, HexDigits, i)
180202
result.add Rune(parseHexInt(hex))
203+
inc i
181204
else:
182205
result.add escapeTable[str[i]]
206+
inc i
183207
else:
184208
result.add str[i]
185-
186-
inc i
209+
inc i
187210
188211
proc parseString(token: Token, multilineStringsNewLines: seq[(int, int)]): KdlVal =
189212
assert token.kind in strings
@@ -224,12 +247,16 @@ proc parseNull(token: Token): KdlVal =
224247
assert token.kind == tkNull
225248
initKNull()
226249

250+
# TODO: don't parse identifier/string twice to know if it's a prop or a value
251+
# should save a temp parsed value and then use it
252+
227253
proc parseValue(token: Token, multilineStringsNewLines: seq[(int, int)]): KdlVal =
228254
result =
229255
case token.kind
230256
of numbers:
231257
token.parseNumber()
232258
of strings:
259+
echo getStackTrace()
233260
token.parseString(multilineStringsNewLines)
234261
of tkBool:
235262
token.parseBool()
@@ -243,6 +270,7 @@ proc parseIdent(
243270
): Option[string] =
244271
case token.kind
245272
of strings:
273+
echo getStackTrace()
246274
token.parseString(multilineStringsNewLines).getString().some
247275
of tkIdent:
248276
token.lexeme.some

src/kdl/utils.nim

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,14 @@
11
## Various utilities for internal use in the library.
22
import std/[strformat, strutils, unicode, streams, tables, macros, sets]
3+
import unicodedb/widths
4+
import graphemes
35

46
import types
57

68
type
79
Coord* = object
810
line*, idx*: int
911
col*: int # col counts each unicode character (rune) as one
10-
colNonAscii*: int # number of non ascii (unicode chars) until col
11-
# this is a weird way of dealing with non-fixed width characters until an apporach
12-
# lice rust's unicode-width appears...
1312

1413
Object* = (
1514
(object or tuple) and not KdlSome and not SomeTable and not List and not Value and
@@ -22,15 +21,19 @@ type
2221

2322
const
2423
newLines* = ["\c\l", "\r", "\n", "\u0085", "\f", "\u2028", "\u2029"]
24+
whitespaces* = [
25+
0x0009, 0x0020, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005,
26+
0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202F, 0x205F, 0x3000,
27+
]
2528
escapeTable* = {
2629
'n': "\u000A", # Line Feed
2730
'r': "\u000D", # Carriage Return
2831
't': "\u0009", # Character Tabulation (Tab)
2932
'\\': "\u005C", # Reverse Solidus (Backslash)
30-
'/': "\u002F", # Solidus (Forwardslash)
3133
'"': "\u0022", # Quotation Mark (Double Quote)
3234
'b': "\u0008", # Backspace
3335
'f': "\u000C", # Form Feed
36+
's': "\u0020", # Space
3437
}.toTable
3538

3639
template fail*(msg: string) =
@@ -51,7 +54,7 @@ proc quoted*(x: string): string =
5154
var isEscape = false
5255
for k, v in escapeTable:
5356
# Don't escape forward slash
54-
if k != '/' and x.continuesWith(v, i):
57+
if x.continuesWith(v, i):
5558
result.add &"\\{k}"
5659
i.inc v.len
5760
isEscape = true
@@ -147,7 +150,6 @@ proc getCoord*(s: Stream, i: int): Coord =
147150
if (let str = s.peekStr(n.len); str == n):
148151
inc result.line
149152
result.col = 0
150-
result.colNonAscii = 0
151153
result.idx.inc n.len
152154
isNewLine = true
153155
s.setPosition(s.getPosition() + n.len)
@@ -156,8 +158,6 @@ proc getCoord*(s: Stream, i: int): Coord =
156158
let r = s.peekRune()
157159
inc result.col
158160
result.idx.inc r.size
159-
if r.size > 1:
160-
inc result.colNonAscii
161161
s.setPosition(s.getPosition() + r.size)
162162

163163
s.setPosition before
@@ -170,16 +170,13 @@ proc getCoord*(s: string, at: int): Coord =
170170
if s.continuesWith(n, i):
171171
inc result.line
172172
i.inc n.len
173-
result.colNonAscii = 0
174173
result.col = 0
175174
isNewLine = true
176175

177176
if not isNewLine:
178177
let r = s.runeAt(i)
179178
i.inc r.size
180179
inc result.col
181-
if r.size > 1:
182-
inc result.colNonAscii
183180

184181
result.idx = i
185182

@@ -209,6 +206,20 @@ proc escapeRunes(s: string, until: int): tuple[s: string, extraLen: int] =
209206

210207
e.inc
211208

209+
proc properWidth(s: string, until: int): int =
210+
## Calculates the unicode-aware display width of `s` up to (but not including) the grapheme at index `until`
211+
var e = 0
212+
for c in s.graphemes:
213+
if e >= until:
214+
return
215+
216+
case c.runeAt(0).unicodeWidth()
217+
of uwdtFull, uwdtWide, uwdtAmbiguous:
218+
result += 2
219+
else:
220+
result += 1
221+
inc e
222+
212223
proc errorAt*(s: Stream or string, coord: Coord): string =
213224
when s is Stream:
214225
let before = s.getPosition()
@@ -223,7 +234,7 @@ proc errorAt*(s: Stream or string, coord: Coord): string =
223234

224235
let lineNum = &"{coord.line + 1} | "
225236
result.add &"{lineNum}{line}\n"
226-
result.add unicode.align("^", lineNum.len + coord.col + extraLen + coord.colNonAscii)
237+
result.add unicode.align("^", lineNum.len + line.properWidth(coord.col) - 1)
227238

228239
# ----- Object variants -----
229240

0 commit comments

Comments
 (0)