Fixes for special characters in field names

turukawa · turukawa · commit cc9a54119b0b · 2024-02-12T12:20:52.000+01:00
Fixes for cases where source field names include special characters (newlines / tabs) or characters used in scripts. As whyqd is used more widely, this may need a thorough review.
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -3,11 +3,15 @@ title: Change log
 summary: Version history, including for legacy versions.
 authors:
   - Gavin Chait
-date: 2023-12-12
+date: 2024-02-12
 tags: wrangling, crosswalks, versions
 ---
 # Change log
 
+## Version 1.1.1 (2024-02-12)
+
+- Fixes for cases where source field names include special characters (newlines / tabs) or characters used in scripts. As whyqd is used more widely, this may need a thorough review.
+
 ## Version 1.1.0 (2023-12-12)
 
 - Fixes to tests
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "whyqd"
-version = "1.1.0"
+version = "1.1.1"
 description = "data wrangling simplicity, complete audit transparency, and at speed"
 authors = ["Gavin Chait <gchait@whythawk.com>"]
 license = "BSD-3-Clause"
diff --git a/whyqd/VERSION b/whyqd/VERSION
@@ -1 +1 @@
-1.1.0
+1.1.1
diff --git a/whyqd/parsers/action.py b/whyqd/parsers/action.py
@@ -127,7 +127,7 @@ def parse(
                     # replace the txt with hx
                     parsed_stack = parsed_stack.replace(f"[{txt}]", hx)
                 i_prsed = []
-                for s in self.parser.get_split_terms(script=parsed_stack, by=","):
+                for s in self.parser.get_split_terms(script=parsed_stack, by=",", maxsplit=-1):
                     splt = self.parser.get_split_terms(script=s, by="<")
                     if len(splt) == 1:
                         i_prsed.extend(self.parser.get_listed_literal(text=s))
@@ -293,6 +293,9 @@ def recover_fields_from_hexed_script(
         if not isinstance(parsed, list):
             parsed = [parsed]
         for term in parsed:
+            if not term:
+                # Blank string artifacts can be introduced
+                continue
             recovered = None
             if isinstance(term, str) and term in modifier_names:
                 recovered = action.get_modifier(term=term)
diff --git a/whyqd/parsers/category.py b/whyqd/parsers/category.py
@@ -185,7 +185,6 @@ def set_schema(
         self.schema_source = schema_source
         self.schema_destination = schema_destination
 
-
     def get_schema_field_category(self, *, field: FieldModel, term: str, is_source: bool = True) -> CategoryModel | None:
         """
         Recover a field category model from a string. It is possible that source and destination schema category share 
@@ -281,4 +280,4 @@ def get_assigned_uniques(self, *, text: str) -> list[str]:
         terms = list(self.parser.generate_contents(text=text))
         if len(terms) != 1:
             raise ValueError(f"Category assignment actions must not be nested. ({text}).")
-        return [self.parser.get_literal(text=t) for t in self.parser.get_split_terms(script=terms[0][1], by=",")]
+        return [self.parser.get_literal(text=t) for t in self.parser.get_split_terms(script=terms[0][1], by=",", maxsplit=-1)]
diff --git a/whyqd/parsers/morph.py b/whyqd/parsers/morph.py
@@ -206,4 +206,4 @@ def get_morph_struts(self, *, term: str) -> list[str]:
             return [term]
         if len(terms) != 1:
             raise ValueError(f"Morph actions must not be nested. ({term}).")
-        return self.parser.get_split_terms(script=terms[0][1], by=",")
+        return self.parser.get_split_terms(script=terms[0][1], by=",", maxsplit=-1)
diff --git a/whyqd/parsers/script.py b/whyqd/parsers/script.py
@@ -146,15 +146,21 @@ def generate_contents(self, *, text) -> list[tuple[int, str]]:
                 start = stack.pop()
                 yield (len(stack), text[start + 1 : i])
 
-    def get_split_terms(self, *, script: str, by: str) -> list[str]:
-        return [s.strip() for s in script.split(by)]
+    def get_split_terms(self, *, script: str, by: str, maxsplit: int = 1) -> list[str]:
+        # https://docs.python.org/3/library/stdtypes.html#str.split
+        # str.split(sep=None, maxsplit=-1) -- NOTE: this wrapper defaults to maxsplit=1; pass maxsplit=-1 to split on every `by`
+        return [s.strip() for s in script.split(sep=by, maxsplit=maxsplit)]
 
     def get_literal(self, *, text: str) -> str:
         literal = text
         try:
             literal = ast.literal_eval(text)
-        except ValueError:
+        except (ValueError, TypeError):
             pass
+        except SyntaxError:
+            # `literal_eval` strips special characters, leading to syntax errors
+            if text.startswith("'") and text.endswith("'"):
+                literal = text[1:-1]
         return literal
 
     def get_listed_literal(self, *, text: str) -> list[str]: