11
11
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
# See the License for the specific language governing permissions and
13
13
# limitations under the License.
14
-
15
-
16
14
import pynini
17
15
from pynini .lib import pynutil
18
16
19
- from nemo_text_processing .text_normalization .de .utils import get_abs_path , load_labels
20
- from nemo_text_processing .text_normalization .en .graph_utils import (
21
- NEMO_ALPHA ,
22
- NEMO_DIGIT ,
23
- NEMO_SPACE ,
24
- GraphFst ,
25
- at ,
26
- colon ,
27
- domain_string ,
28
- double_quotes ,
29
- double_slash ,
30
- http ,
31
- https ,
32
- period ,
33
- protocol_string ,
34
- username_string ,
35
- www ,
36
- )
17
+ from nemo_text_processing .text_normalization .en .graph_utils import NEMO_ALPHA , NEMO_DIGIT , GraphFst , insert_space
18
+ from nemo_text_processing .text_normalization .es .utils import get_abs_path , load_labels
19
+
20
+ common_domains = [x [0 ] for x in load_labels (get_abs_path ("data/electronic/domain.tsv" ))]
21
+ symbols = [x [0 ] for x in load_labels (get_abs_path ("data/electronic/symbols.tsv" ))]
37
22
38
23
39
24
class ElectronicFst (GraphFst ):
@@ -50,65 +35,45 @@ class ElectronicFst(GraphFst):
50
35
def __init__ (self , deterministic : bool = True ):
51
36
super ().__init__ (name = "electronic" , kind = "classify" , deterministic = deterministic )
52
37
53
- period_fst = pynini .accep (period )
54
-
55
- symbols = [x [0 ] for x in load_labels (get_abs_path ("data/electronic/symbols.tsv" ))]
56
- symbols = pynini .union (* symbols )
57
- # all symbols
58
- symbols_no_period = pynini .difference (symbols , period_fst ) # alphabet of accepted symbols excluding the '.'
59
- accepted_characters = pynini .closure (
60
- (NEMO_ALPHA | NEMO_DIGIT | symbols_no_period ), 1
61
- ) # alphabet of accepted chars excluding the '.'
62
- all_characters = pynini .closure (
63
- (NEMO_ALPHA | NEMO_DIGIT | symbols ), 1
64
- ) # alphabet of accepted chars including the '.'
65
-
66
- # domains
67
- domain = period_fst + accepted_characters
68
- domain_graph = (
69
- pynutil .insert (domain_string + colon + NEMO_SPACE + double_quotes )
70
- + (accepted_characters + pynini .closure (domain , 1 ))
71
- + pynutil .insert (double_quotes )
72
- )
38
+ dot = pynini .accep ("." )
39
+ accepted_common_domains = pynini .union (* common_domains )
40
+ accepted_symbols = pynini .union (* symbols ) - dot
41
+ accepted_characters = pynini .closure (NEMO_ALPHA | NEMO_DIGIT | accepted_symbols )
42
+ acceepted_characters_with_dot = pynini .closure (NEMO_ALPHA | NEMO_DIGIT | accepted_symbols | dot )
73
43
74
44
# email
75
45
username = (
76
- pynutil .insert (username_string + colon + NEMO_SPACE + double_quotes )
77
- + all_characters
78
- + pynutil .insert (double_quotes )
79
- + pynini .cross (at , NEMO_SPACE )
46
+ pynutil .insert ("username: \" " )
47
+ + acceepted_characters_with_dot
48
+ + pynutil .insert (" \" " )
49
+ + pynini .cross ('@' , ' ' )
80
50
)
81
- email = username + domain_graph
82
-
83
- # social media tags
84
- tag = (
85
- pynutil . delete ( at )
86
- + pynutil . insert ( username_string + colon + NEMO_SPACE + double_quotes )
87
- + ( accepted_characters | ( accepted_characters + pynini .closure (domain , 1 )) )
88
- + pynutil .insert (double_quotes )
51
+ domain_graph = accepted_characters + dot + accepted_characters
52
+ domain_graph = pynutil . insert ( "domain: \" " ) + domain_graph + pynutil . insert ( " \" " )
53
+ domain_common_graph = (
54
+ pynutil . insert ( "domain: \" " )
55
+ + accepted_characters
56
+ + accepted_common_domains
57
+ + pynini . closure (( accepted_symbols | dot ) + pynini .closure (accepted_characters , 1 ), 0 , 1 )
58
+ + pynutil .insert (" \" " )
89
59
)
60
+ graph = (username + domain_graph ) | domain_common_graph
90
61
91
62
# url
92
- protocol_start = pynini .accep (https + colon + double_slash ) | pynini .accep (http + colon + double_slash )
63
+ protocol_start = pynini .accep (" https://" ) | pynini .accep (" http://" )
93
64
protocol_end = (
94
- pynini .accep (www + period )
65
+ pynini .accep (" www." )
95
66
if deterministic
96
67
else (
97
- pynini .accep (www + period )
98
- | pynini .cross (www + period , "vé vé vé." )
99
- | pynini .cross (www + period , "dupla vé dupla vé dupla vé." )
100
- | pynini .cross (www + period , "kettős vé kettős vé kettős vé." )
68
+ pynini .accep (" www." )
69
+ | pynini .cross (" www." , "vé vé vé." )
70
+ | pynini .cross (" www." , "dupla vé dupla vé dupla vé." )
71
+ | pynini .cross (" www." , "kettős vé kettős vé kettős vé." )
101
72
)
102
73
)
103
74
protocol = protocol_start | protocol_end | (protocol_start + protocol_end )
104
- protocol = (
105
- pynutil .insert (protocol_string + colon + NEMO_SPACE + double_quotes )
106
- + protocol
107
- + pynutil .insert (double_quotes )
108
- )
109
- url = protocol + pynutil .insert (NEMO_SPACE ) + (domain_graph )
110
-
111
- graph = url | domain_graph | email | tag
75
+ protocol = pynutil .insert ("protocol: \" " ) + protocol + pynutil .insert ("\" " )
76
+ graph |= protocol + insert_space + (domain_graph | domain_common_graph )
112
77
self .graph = graph
113
78
114
79
final_graph = self .add_tokens (self .graph + pynutil .insert (" preserve_order: true" ))
0 commit comments