Skip to content

Commit d9adadc

Browse files
committed
Add Korean TN test files for cardinal
Signed-off-by: Jinwoo Bae <34386414+bbae0312@users.noreply.github.com>
1 parent 3e4ac3e commit d9adadc

File tree

4 files changed

+189
-0
lines changed

4 files changed

+189
-0
lines changed
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
1~일
2+
2~이
3+
-2~마이너스 이
4+
3~삼
5+
123~백이십삼
6+
13000~만삼천
7+
9000~구천
8+
123000~십이만 삼천
9+
123000012~일억 이천삼백만 십이
10+
1000000~백만
11+
100000000~일억
12+
1000000000000~일조
13+
100000000000000~백조
14+
20000000000001~이십조 일
15+
800000000001001~팔백조 천일
16+
82345670123135111~팔경 이천삼백사십오조 육천칠백일억 이천삼백십삼만 오천백십일
17+
9999999999999~구조 구천구백구십구억 구천구백구십구만 구천구백구십구
18+
99999999999999~구십구조 구천구백구십구억 구천구백구십구만 구천구백구십구
19+
999999999999999~구백구십구조 구천구백구십구억 구천구백구십구만 구천구백구십구
20+
9999999999999999~구천구백구십구조 구천구백구십구억 구천구백구십구만 구천구백구십구
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pytest
16+
from parameterized import parameterized
17+
18+
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
19+
from nemo_text_processing.text_normalization.normalize import Normalizer
20+
21+
from ..utils import CACHE_DIR, parse_test_case_file
22+
23+
24+
class TestCardinal:
    """Korean text-normalization tests for cardinal numbers."""

    # Constructed once, when the class body is evaluated, and reused by every
    # generated test case.
    normalizer_ko = Normalizer(
        lang='ko',
        cache_dir='export/ko_tn_grammars_lower_cased',
        overwrite_cache=False,
        input_case='lower_cased',
    )

    # One test is generated per entry returned by parse_test_case_file.
    @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_cardinal.txt'))
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        # Normalize the written form and compare it with the expected output.
        actual = self.normalizer_ko.normalize(test_input)
        assert expected == actual
33+
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
#! /bin/sh
# Directory that runtest enters before invoking normalizer_main
# (first argument; falls back to the path below).
GRAMMARS_DIR="${1:-/workspace/sparrowhawk/documentation/grammars}"
# Root directory under which the data_text_normalization/ test files are
# looked up (second argument; falls back to the path below).
TEST_DIR="${2:-/workspace/tests}"
4+
5+
# Run every case of a test-case file through normalizer_main and compare the
# prediction with the expected spoken form via assertEquals.
# Globals:   GRAMMARS_DIR (read) - directory entered before running the normalizer
#            input, testcase, written, spoken, denorm_pred (written)
# Arguments: $1 - path to a file holding one "written~spoken" case per line
# Outputs:   prints the input path to stdout; an error goes to stderr
# Returns:   1 if GRAMMARS_DIR cannot be entered
runtest () {
  input=$1
  echo "INPUT is $input"
  # Stop here rather than run the normalizer from the wrong directory.
  cd "${GRAMMARS_DIR}" || {
    echo "ERROR: cannot cd to ${GRAMMARS_DIR}" >&2
    return 1
  }

  # read test file; -r keeps backslashes literal and the -n check keeps a
  # final case that is not terminated by a newline
  while read -r testcase || [ -n "$testcase" ]; do
    # Split on the first '~'. A here-document replaces the bash-only
    # here-string so the function also parses under the /bin/sh shebang.
    IFS='~' read -r written spoken <<EOF
$testcase
EOF
    # replace non breaking space with breaking space
    # Use below if postprocessor is not used. Comment if it is used
    #denorm_pred=$(printf '%s\n' "$written" | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g')
    # Use below if postprocessor is used. Comment if it is not used
    # "$written" is quoted so the input reaches the normalizer verbatim
    # (no word splitting, no glob expansion).
    denorm_pred=$(printf '%s\n' "$written" | normalizer_main --config=sparrowhawk_configuration_pp.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g')

    # trim white space (printf instead of the non-portable `echo -e`)
    spoken="$(printf '%s\n' "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"
    denorm_pred="$(printf '%s\n' "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"

    # input expected actual
    assertEquals "$written" "$spoken" "$denorm_pred"
  done < "$input"
}
27+
28+
29+
# Test function: runs the cardinal text-normalization cases through runtest.
# Globals: TEST_DIR (read) - root of the test data; input (written)
testTNCardinal() {
  input=$TEST_DIR/data_text_normalization/test_cases_cardinal.txt
  # Quoted so a TEST_DIR containing whitespace stays a single argument.
  runtest "$input"
}
33+
34+
#testTNSpecialText() {
35+
# input=$TEST_DIR/data_text_normalization/test_cases_special_text.txt
36+
# runtest $input
37+
#}
38+
39+
#testTNDate() {
40+
# input=$TEST_DIR/data_text_normalization/test_cases_date.txt
41+
# runtest $input
42+
#}
43+
44+
#testTNDecimal() {
45+
# input=$TEST_DIR/data_text_normalization/test_cases_decimal.txt
46+
# runtest $input
47+
#}
48+
49+
#testTNRange() {
50+
# input=$TEST_DIR/data_text_normalization/test_cases_range.txt
51+
# runtest $input
52+
#}
53+
54+
#testTNSerial() {
55+
# input=$TEST_DIR/data_text_normalization/test_cases_serial.txt
56+
# runtest $input
57+
#}
58+
59+
#testTNRoman() {
60+
# input=$TEST_DIR/data_text_normalization/test_cases_roman.txt
61+
# runtest $input
62+
#}
63+
64+
#testTNElectronic() {
65+
# input=$TEST_DIR/data_text_normalization/test_cases_electronic.txt
66+
# runtest $input
67+
#}
68+
69+
#testTNFraction() {
70+
# input=$TEST_DIR/data_text_normalization/test_cases_fraction.txt
71+
# runtest $input
72+
#}
73+
74+
#testTNMoney() {
75+
# input=$TEST_DIR/data_text_normalization/test_cases_money.txt
76+
# runtest $input
77+
#}
78+
79+
#testTNOrdinal() {
80+
# input=$TEST_DIR/data_text_normalization/test_cases_ordinal.txt
81+
# runtest $input
82+
#}
83+
84+
#testTNTelephone() {
85+
# input=$TEST_DIR/data_text_normalization/test_cases_telephone.txt
86+
# runtest $input
87+
#}
88+
89+
#testTNTime() {
90+
# input=$TEST_DIR/data_text_normalization/test_cases_time.txt
91+
# runtest $input
92+
#}
93+
94+
#testTNMeasure() {
95+
# input=$TEST_DIR/data_text_normalization/test_cases_measure.txt
96+
# runtest $input
97+
#}
98+
99+
#testTNWhitelist() {
100+
# input=$TEST_DIR/data_text_normalization/test_cases_whitelist.txt
101+
# runtest $input
102+
#}
103+
104+
#testTNWord() {
105+
# input=$TEST_DIR/data_text_normalization/test_cases_word.txt
106+
# runtest $input
107+
#}
108+
109+
#testTNAddress() {
110+
# input=$TEST_DIR/data_text_normalization/test_cases_address.txt
111+
# runtest $input
112+
#}
113+
114+
#testTNMath() {
115+
# input=$TEST_DIR/data_text_normalization/test_cases_math.txt
116+
# runtest $input
117+
#}
118+
119+
# Drop every positional parameter before shUnit2 is sourced.
set --

# Load shUnit2
. /workspace/shunit2/shunit2

0 commit comments

Comments
 (0)