Skip to content

Commit cc9a541

Browse files
committed
Fixes for special characters in field names
Fixes for cases where source field names include special characters (newlines / tabs) or characters used in scripts. As whyqd is used more widely, this may need a thorough review.
1 parent 7137e15 commit cc9a541

File tree

7 files changed

+22
-10
lines changed

7 files changed

+22
-10
lines changed

docs/changelog.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,15 @@ title: Change log
33
summary: Version history, including for legacy versions.
44
authors:
55
- Gavin Chait
6-
date: 2023-12-12
6+
date: 2024-02-12
77
tags: wrangling, crosswalks, versions
88
---
99
# Change log
1010

11+
## Version 1.1.1 (2024-02-12)
12+
13+
- Fixes for cases where source field names include special characters (newlines / tabs) or characters used in scripts. As whyqd is used more widely, this may need a thorough review.
14+
1115
## Version 1.1.0 (2023-12-12)
1216

1317
- Fixes to tests

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "whyqd"
3-
version = "1.1.0"
3+
version = "1.1.1"
44
description = "data wrangling simplicity, complete audit transparency, and at speed"
55
authors = ["Gavin Chait <gchait@whythawk.com>"]
66
license = "BSD-3-Clause"

whyqd/VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.1.0
1+
1.1.1

whyqd/parsers/action.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ def parse(
127127
# replace the txt with hx
128128
parsed_stack = parsed_stack.replace(f"[{txt}]", hx)
129129
i_prsed = []
130-
for s in self.parser.get_split_terms(script=parsed_stack, by=","):
130+
for s in self.parser.get_split_terms(script=parsed_stack, by=",", maxsplit=-1):
131131
splt = self.parser.get_split_terms(script=s, by="<")
132132
if len(splt) == 1:
133133
i_prsed.extend(self.parser.get_listed_literal(text=s))
@@ -293,6 +293,9 @@ def recover_fields_from_hexed_script(
293293
if not isinstance(parsed, list):
294294
parsed = [parsed]
295295
for term in parsed:
296+
if not term:
297+
# Blank string artifacts can be introduced
298+
continue
296299
recovered = None
297300
if isinstance(term, str) and term in modifier_names:
298301
recovered = action.get_modifier(term=term)

whyqd/parsers/category.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,6 @@ def set_schema(
185185
self.schema_source = schema_source
186186
self.schema_destination = schema_destination
187187

188-
189188
def get_schema_field_category(self, *, field: FieldModel, term: str, is_source: bool = True) -> CategoryModel | None:
190189
"""
191190
Recover a field category model from a string. It is possible that source and destination schema category share
@@ -281,4 +280,4 @@ def get_assigned_uniques(self, *, text: str) -> list[str]:
281280
terms = list(self.parser.generate_contents(text=text))
282281
if len(terms) != 1:
283282
raise ValueError(f"Category assignment actions must not be nested. ({text}).")
284-
return [self.parser.get_literal(text=t) for t in self.parser.get_split_terms(script=terms[0][1], by=",")]
283+
return [self.parser.get_literal(text=t) for t in self.parser.get_split_terms(script=terms[0][1], by=",", maxsplit=-1)]

whyqd/parsers/morph.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,4 +206,4 @@ def get_morph_struts(self, *, term: str) -> list[str]:
206206
return [term]
207207
if len(terms) != 1:
208208
raise ValueError(f"Morph actions must not be nested. ({term}).")
209-
return self.parser.get_split_terms(script=terms[0][1], by=",")
209+
return self.parser.get_split_terms(script=terms[0][1], by=",", maxsplit=-1)

whyqd/parsers/script.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -146,15 +146,21 @@ def generate_contents(self, *, text) -> list[tuple[int, str]]:
146146
start = stack.pop()
147147
yield (len(stack), text[start + 1 : i])
148148

149-
def get_split_terms(self, *, script: str, by: str) -> list[str]:
150-
return [s.strip() for s in script.split(by)]
149+
def get_split_terms(self, *, script: str, by: str, maxsplit: int = 1) -> list[str]:
150+
# https://docs.python.org/3/library/stdtypes.html#str.split
151+
# str.split(sep=None, maxsplit=-1)
152+
return [s.strip() for s in script.split(sep=by, maxsplit=maxsplit)]
151153

152154
def get_literal(self, *, text: str) -> str:
153155
literal = text
154156
try:
155157
literal = ast.literal_eval(text)
156-
except ValueError:
158+
except (ValueError, TypeError):
157159
pass
160+
except SyntaxError:
161+
# `literal_eval` cannot parse quoted text containing raw special characters (e.g. newlines) and raises a SyntaxError
162+
if text.startswith("'") and text.endswith("'"):
163+
literal = text[1:-1]
158164
return literal
159165

160166
def get_listed_literal(self, *, text: str) -> list[str]:

0 commit comments

Comments
 (0)