Now using unicodedb and graphemes to print errors properly; now parsing whitespace escapes (a backslash followed by one or more whitespaces/newlines) properly

Patitotective · Patitotective · commit bab7bd4168ac · 2024-11-21T11:47:03.000-05:00
diff --git a/kdl.nimble b/kdl.nimble
@@ -10,6 +10,9 @@ skipFiles = @["src/kdl/query.nim", "src/kdl/schema.nim"]
 # Dependencies
 
 requires "nim >= 1.6.0"
+# For proper unicode handling when printing errors
+requires "graphemes == 0.12.0"
+requires "unicodedb == 0.13.0"
 
 task docs, "Generate documentation":
   # We create the prefs module documentation separately because it is not imported in the main kdl file as it's not backed:js friendly
diff --git a/src/kdl/lexer.nim b/src/kdl/lexer.nim
@@ -54,10 +54,6 @@ type
 const
   nonIdenChars = {'\\', '/', '(', ')', '{', '}', '<', '>', ';', '[', ']', '=', ',', '"'}
   nonInitialChars = Digits + nonIdenChars
-  whitespaces = [
-    0x0009, 0x0020, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005,
-    0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202F, 0x205F, 0x3000,
-  ]
   equals = [0x003D, 0xFE66, 0xFF1D, 0x1F7F0]
   litMatches = {
     "*": tkStar,
@@ -84,9 +80,9 @@ const
 proc `$`*(lexer: Lexer): string =
   result =
     if lexer.isStream:
-      &"{(if lexer.stream.atEnd: \"SUCCESS\" else: \"FAIL\")}\n\t"
+      &"{(if lexer.stream.atEnd: \"SUCCESS\" else: \"FAIL\")}\n  "
     else:
-      &"{(if lexer.current == lexer.source.len: \"SUCCESS\" else: \"FAIL\")} {lexer.current}/{lexer.source.len}\n\t"
+      &"{(if lexer.current == lexer.source.len: \"SUCCESS\" else: \"FAIL\")} {lexer.current}/{lexer.source.len}\n  "
 
   for token in lexer.stack:
     result.add &"({token.kind})"
@@ -257,12 +253,51 @@ proc disallowedRunes() {.lexing: tkEmpty.} =
   elif isDisallowedRune(r):
     lexer.error &"The code point U+{r.toHex(4)} isn't allowed on a KDL document"
 
+proc tokenMultiLineComment*() {.lexing: tkEmpty.} =
+  if not lexer.peek("/*"):
+    return
+
+  lexer.inc 2
+
+  var nested = 1
+
+  while not lexer.eof() and nested > 0:
+    if lexer.peek("*/"):
+      dec nested
+      lexer.inc 2
+    elif lexer.peek("/*"):
+      inc nested
+      lexer.inc 2
+    else:
+      inc lexer
+
+  if nested > 0:
+    lexer.error "Expected end of multi-line comment"
+
 proc tokenNewLine*() {.lexing: tkNewLine.} =
   for nl in newLines:
     if lexer.peek(nl):
       lexer.inc nl.len
       break
 
+proc tokenWhitespace*() {.lexing: tkWhitespace.} =
+  ## Treats multi-line comments as whitespace
+  if not lexer.eof() and (let rune = lexer.peekRune(); rune.int in whitespaces):
+    lexer.inc rune.size
+  else:
+    lexer.tokenMultiLineComment()
+
+proc skipWhitespaceOrNewline*() {.lexing: tkEmpty.} =
+  if not lexer.eof():
+    if (let rune = lexer.peekRune(); rune.int in whitespaces):
+      lexer.inc rune.size
+    else:
+      lexer.tokenNewLine(addToStack = false)
+
+proc skipWhitespaces*() {.lexing: tkEmpty.} =
+  while lexer.tokenWhitespace(addToStack = addToStack, consume = consume):
+    discard
+
 proc tokenNumWhole() {.lexing: tkEmpty.} =
   if lexer.peek() in {'-', '+'}:
     inc lexer
@@ -368,6 +403,7 @@ proc tokenStringBody(lexer: var Lexer, raw = false) =
 
   while not lexer.eof():
     lexer.disallowedRunes()
+
     let before = lexer.getPos()
     if lexer.tokenNewLine(addToStack = false):
       lexer.multilineStringsNewLines.add((before, lexer.getPos() - before))
@@ -380,11 +416,18 @@ proc tokenStringBody(lexer: var Lexer, raw = false) =
         inc lexer
         continue
 
+      lexer.inc
+
+      if lexer.skipWhitespaceOrNewline():
+        while lexer.skipWhitespaceOrNewline():
+          discard
+        continue
+
       let next = lexer.peek(1)
       if next notin escapeTable and next != 'u':
         lexer.error &"Invalid escape '{next}'"
 
-      lexer.inc 2
+      lexer.inc
 
       if next == 'u':
         if lexer.peek() != '{':
@@ -419,37 +462,6 @@ proc tokenString*() {.lexing: tkString.} =
 proc tokenRawString*() {.lexing: tkRawString.} =
   lexer.tokenStringBody(raw = true)
 
-proc tokenMultiLineComment*() {.lexing: tkEmpty.} =
-  if not lexer.peek("/*"):
-    return
-
-  lexer.inc 2
-
-  var nested = 1
-
-  while not lexer.eof() and nested > 0:
-    if lexer.peek("*/"):
-      dec nested
-      lexer.inc 2
-    elif lexer.peek("/*"):
-      inc nested
-      lexer.inc 2
-    else:
-      inc lexer
-
-  if nested > 0:
-    lexer.error "Expected end of multi-line comment"
-
-proc tokenWhitespace*() {.lexing: tkWhitespace.} =
-  if not lexer.eof() and (let rune = lexer.peekRune(); rune.int in whitespaces):
-    lexer.inc rune.size
-  else:
-    lexer.tokenMultiLineComment()
-
-proc skipWhitespaces*() {.lexing: tkEmpty.} =
-  while lexer.tokenWhitespace():
-    discard
-
 proc tokenIdent*() {.lexing: tkIdent.} =
   if lexer.eof() or lexer.peek() in nonInitialChars:
     return
diff --git a/src/kdl/parser.nim b/src/kdl/parser.nim
@@ -168,22 +168,45 @@ proc parseNumber(token: Token): KdlVal =
     result = initKFloat()
     result.fnum = token.lexeme.parseFloat()
 
+proc continuesWithNewLine(s: string, at: var int, consume = true): bool =
+  ## Checks whether `s` has a new line at index `at`; if so, and `consume` is
+  ## true, advances `at` by the length of that new line
+  for nl in newLines:
+    if s.continuesWith(nl, at):
+      if consume:
+        at.inc nl.len
+      return true
+
+proc continuesWithWhitespace(s: string, at: var int, consume = true): bool =
+  ## Checks whether `s` has a whitespace at index `at`; if so, and `consume` is
+  ## true, advances `at` by the byte length of that whitespace
+  for w in whitespaces:
+    if s.continuesWith($Rune(w), at):
+      if consume:
+        at.inc w.Rune.size
+      return true
+
 proc escapeString(str: string, x = 0 .. str.high): string =
   var i = x.a
   while i <= x.b:
     if str[i] == '\\':
       inc i # Consume backslash
-      if str[i] == 'u':
+
+      if str.continuesWithNewLine(i) or str.continuesWithWhitespace(i):
+        while str.continuesWithNewLine(i) or str.continuesWithWhitespace(i):
+          discard
+      elif str[i] == 'u':
         inc i, 2 # Consume u and opening {
         var hex: string
         inc i, str.parseWhile(hex, HexDigits, i)
         result.add Rune(parseHexInt(hex))
+        inc i
       else:
         result.add escapeTable[str[i]]
+        inc i
     else:
       result.add str[i]
-
-    inc i
+      inc i
 
 proc parseString(token: Token, multilineStringsNewLines: seq[(int, int)]): KdlVal =
   assert token.kind in strings
@@ -224,12 +247,16 @@ proc parseNull(token: Token): KdlVal =
   assert token.kind == tkNull
   initKNull()
 
+# TODO: don't parse identifier/string twice to know if it's a prop or a value
+# should save a temp parsed value and then use it
+
 proc parseValue(token: Token, multilineStringsNewLines: seq[(int, int)]): KdlVal =
   result =
     case token.kind
     of numbers:
       token.parseNumber()
     of strings:
+      echo getStackTrace()
       token.parseString(multilineStringsNewLines)
     of tkBool:
       token.parseBool()
@@ -243,6 +270,7 @@ proc parseIdent(
 ): Option[string] =
   case token.kind
   of strings:
+    echo getStackTrace()
     token.parseString(multilineStringsNewLines).getString().some
   of tkIdent:
     token.lexeme.some
diff --git a/src/kdl/utils.nim b/src/kdl/utils.nim
@@ -1,15 +1,14 @@
 ## Various utilities for internal use in the library.
 import std/[strformat, strutils, unicode, streams, tables, macros, sets]
+import unicodedb/widths
+import graphemes
 
 import types
 
 type
   Coord* = object
     line*, idx*: int
     col*: int # col counts each unicode character (rune) as one
-    colNonAscii*: int # number of non ascii (unicode chars) until col
-    # this is a weird way of dealing with non-fixed width characters until an apporach
-    # lice rust's unicode-width appears...
 
   Object* = (
     (object or tuple) and not KdlSome and not SomeTable and not List and not Value and
@@ -22,15 +21,19 @@ type
 
 const
   newLines* = ["\c\l", "\r", "\n", "\u0085", "\f", "\u2028", "\u2029"]
+  whitespaces* = [
+    0x0009, 0x0020, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005,
+    0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202F, 0x205F, 0x3000,
+  ]
   escapeTable* = {
     'n': "\u000A", # Line Feed
     'r': "\u000D", # Carriage Return
     't': "\u0009", # Character Tabulation (Tab)
     '\\': "\u005C", # Reverse Solidus (Backslash)
-    '/': "\u002F", # Solidus (Forwardslash)
     '"': "\u0022", # Quotation Mark (Double Quote)
     'b': "\u0008", # Backspace
     'f': "\u000C", # Form Feed
+    's': "\u0020", # Space
   }.toTable
 
 template fail*(msg: string) =
@@ -51,7 +54,7 @@ proc quoted*(x: string): string =
     var isEscape = false
     for k, v in escapeTable:
       # Don't escape forward slash
-      if k != '/' and x.continuesWith(v, i):
+      if x.continuesWith(v, i):
         result.add &"\\{k}"
         i.inc v.len
         isEscape = true
@@ -147,7 +150,6 @@ proc getCoord*(s: Stream, i: int): Coord =
       if (let str = s.peekStr(n.len); str == n):
         inc result.line
         result.col = 0
-        result.colNonAscii = 0
         result.idx.inc n.len
         isNewLine = true
         s.setPosition(s.getPosition() + n.len)
@@ -156,8 +158,6 @@ proc getCoord*(s: Stream, i: int): Coord =
       let r = s.peekRune()
       inc result.col
       result.idx.inc r.size
-      if r.size > 1:
-        inc result.colNonAscii
       s.setPosition(s.getPosition() + r.size)
 
   s.setPosition before
@@ -170,16 +170,13 @@ proc getCoord*(s: string, at: int): Coord =
       if s.continuesWith(n, i):
         inc result.line
         i.inc n.len
-        result.colNonAscii = 0
         result.col = 0
         isNewLine = true
 
     if not isNewLine:
       let r = s.runeAt(i)
       i.inc r.size
       inc result.col
-      if r.size > 1:
-        inc result.colNonAscii
 
   result.idx = i
 
@@ -209,6 +206,20 @@ proc escapeRunes(s: string, until: int): tuple[s: string, extraLen: int] =
 
     e.inc
 
+proc properWidth(s: string, until: int): int =
+  ## Calculates the display width of the first `until` graphemes of `s`
+  var e = 0
+  for c in s.graphemes:
+    if e >= until:
+      return
+
+    case c.runeAt(0).unicodeWidth()
+    of uwdtFull, uwdtWide, uwdtAmbiguous:
+      result += 2
+    else:
+      result += 1
+    inc e
+
 proc errorAt*(s: Stream or string, coord: Coord): string =
   when s is Stream:
     let before = s.getPosition()
@@ -223,7 +234,7 @@ proc errorAt*(s: Stream or string, coord: Coord): string =
 
   let lineNum = &"{coord.line + 1} | "
   result.add &"{lineNum}{line}\n"
-  result.add unicode.align("^", lineNum.len + coord.col + extraLen + coord.colNonAscii)
+  result.add unicode.align("^", lineNum.len + line.properWidth(coord.col) - 1)
 
 # ----- Object variants -----