Skip to content

Commit ab95888

Browse files
authored
Merge pull request #16 from jftuga/normalize
Add fraction symbol normalization
2 parents 6e5b7fe + fcd82d3 commit ab95888

File tree

3 files changed

+11
-2
lines changed

3 files changed

+11
-2
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -60,7 +60,7 @@ test-install: clean
6060
# Production PyPI targets
6161
prod-publish: clean check-pypirc $(VENV_NAME) build
6262
@echo "Are you sure you want to publish to production PyPI? [y/N] " && read ans && [ $${ans:-N} = y ]
63-
./$(VENV_NAME)/bin/twine --verbose upload dist/*
63+
./$(VENV_NAME)/bin/twine upload --verbose dist/*
6464

6565
prod-install: clean
6666
$(PYTHON) -m venv prod-install-venv

deidentification/deidentification_constants.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
pgmName = "deidentification"
44
pgmUrl = "https://github.com/jftuga/deidentification"
5-
pgmVersion = "1.3.1"
5+
pgmVersion = "1.3.2"
66

77
# this maps the default replacement word for each language
88
class DeidentificationLanguages(Enum):

deidentification/normalize_punctuation.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ def normalize_punctuation(text: str) -> str:
88
- Converts ellipsis character to three periods
99
- Converts various spaces to regular space
1010
- Converts bullet points to asterisk
11+
- Converts fraction symbols (like ½) to ASCII representations (like 1/2)
1112
- Preserves but normalizes common symbols (©, ®, ™)
1213
1314
Args:
@@ -63,6 +64,13 @@ def normalize_punctuation(text: str) -> str:
6364
chr(0x00B7): '*', # MIDDLE DOT
6465
chr(0x2219): '*', # BULLET OPERATOR
6566

67+
# Fraction characters
68+
chr(0x00BD): ' 1/2', # FRACTION ONE HALF (½)
69+
chr(0x00BC): ' 1/4', # FRACTION ONE QUARTER
70+
chr(0x00BE): ' 3/4', # FRACTION THREE QUARTERS
71+
chr(0x2153): ' 1/3', # FRACTION ONE THIRD
72+
chr(0x2154): ' 2/3', # FRACTION TWO THIRDS
73+
6674
# Normalize common symbols
6775
chr(0x00A9): '(c)', # COPYRIGHT SIGN
6876
chr(0x00AE): '(r)', # REGISTERED SIGN
@@ -75,3 +83,4 @@ def normalize_punctuation(text: str) -> str:
7583
normalized_text = normalized_text.replace(unicode_char, ascii_char)
7684

7785
return normalized_text
86+

0 commit comments

Comments
 (0)