11
11
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
# See the License for the specific language governing permissions and
13
13
# limitations under the License.
14
+
15
+
14
16
import pynini
15
17
from pynini .lib import pynutil
16
18
17
- from nemo_text_processing .text_normalization .en .graph_utils import NEMO_ALPHA , NEMO_DIGIT , GraphFst , insert_space
18
- from nemo_text_processing .text_normalization .es .utils import get_abs_path , load_labels
19
-
20
- common_domains = [x [0 ] for x in load_labels (get_abs_path ("data/electronic/domain.tsv" ))]
21
- symbols = [x [0 ] for x in load_labels (get_abs_path ("data/electronic/symbols.tsv" ))]
19
+ from nemo_text_processing .text_normalization .de .utils import get_abs_path , load_labels
20
+ from nemo_text_processing .text_normalization .en .graph_utils import (
21
+ NEMO_ALPHA ,
22
+ NEMO_DIGIT ,
23
+ NEMO_SPACE ,
24
+ GraphFst ,
25
+ at ,
26
+ colon ,
27
+ domain_string ,
28
+ double_quotes ,
29
+ double_slash ,
30
+ http ,
31
+ https ,
32
+ period ,
33
+ protocol_string ,
34
+ username_string ,
35
+ www ,
36
+ )
22
37
23
38
24
39
class ElectronicFst (GraphFst ):
@@ -35,45 +50,65 @@ class ElectronicFst(GraphFst):
35
50
def __init__ (self , deterministic : bool = True ):
36
51
super ().__init__ (name = "electronic" , kind = "classify" , deterministic = deterministic )
37
52
38
- dot = pynini .accep ("." )
39
- accepted_common_domains = pynini .union (* common_domains )
40
- accepted_symbols = pynini .union (* symbols ) - dot
41
- accepted_characters = pynini .closure (NEMO_ALPHA | NEMO_DIGIT | accepted_symbols )
42
- acceepted_characters_with_dot = pynini .closure (NEMO_ALPHA | NEMO_DIGIT | accepted_symbols | dot )
53
+ period_fst = pynini .accep (period )
54
+
55
+ symbols = [x [0 ] for x in load_labels (get_abs_path ("data/electronic/symbols.tsv" ))]
56
+ symbols = pynini .union (* symbols )
57
+ # all symbols
58
+ symbols_no_period = pynini .difference (symbols , period_fst ) # alphabet of accepted symbols excluding the '.'
59
+ accepted_characters = pynini .closure (
60
+ (NEMO_ALPHA | NEMO_DIGIT | symbols_no_period ), 1
61
+ ) # alphabet of accepted chars excluding the '.'
62
+ all_characters = pynini .closure (
63
+ (NEMO_ALPHA | NEMO_DIGIT | symbols ), 1
64
+ ) # alphabet of accepted chars including the '.'
65
+
66
+ # domains
67
+ domain = period_fst + accepted_characters
68
+ domain_graph = (
69
+ pynutil .insert (domain_string + colon + NEMO_SPACE + double_quotes )
70
+ + (accepted_characters + pynini .closure (domain , 1 ))
71
+ + pynutil .insert (double_quotes )
72
+ )
43
73
44
74
# email
45
75
username = (
46
- pynutil .insert ("username: \" " )
47
- + acceepted_characters_with_dot
48
- + pynutil .insert (" \" " )
49
- + pynini .cross ('@' , ' ' )
76
+ pynutil .insert (username_string + colon + NEMO_SPACE + double_quotes )
77
+ + all_characters
78
+ + pynutil .insert (double_quotes )
79
+ + pynini .cross (at , NEMO_SPACE )
50
80
)
51
- domain_graph = accepted_characters + dot + accepted_characters
52
- domain_graph = pynutil . insert ( "domain: \" " ) + domain_graph + pynutil . insert ( " \" " )
53
- domain_common_graph = (
54
- pynutil . insert ( "domain: \" " )
55
- + accepted_characters
56
- + accepted_common_domains
57
- + pynini . closure (( accepted_symbols | dot ) + pynini .closure (accepted_characters , 1 ), 0 , 1 )
58
- + pynutil .insert (" \" " )
81
+ email = username + domain_graph
82
+
83
+ # social media tags
84
+ tag = (
85
+ pynutil . delete ( at )
86
+ + pynutil . insert ( username_string + colon + NEMO_SPACE + double_quotes )
87
+ + ( accepted_characters | ( accepted_characters + pynini .closure (domain , 1 )) )
88
+ + pynutil .insert (double_quotes )
59
89
)
60
- graph = (username + domain_graph ) | domain_common_graph
61
90
62
91
# url
63
- protocol_start = pynini .accep (" https://" ) | pynini .accep (" http://" )
92
+ protocol_start = pynini .accep (https + colon + double_slash ) | pynini .accep (http + colon + double_slash )
64
93
protocol_end = (
65
- pynini .accep (" www." )
94
+ pynini .accep (www + period )
66
95
if deterministic
67
96
else (
68
- pynini .accep (" www." )
69
- | pynini .cross (" www." , "vé vé vé." )
70
- | pynini .cross (" www." , "dupla vé dupla vé dupla vé." )
71
- | pynini .cross (" www." , "kettős vé kettős vé kettős vé." )
97
+ pynini .accep (www + period )
98
+ | pynini .cross (www + period , "vé vé vé." )
99
+ | pynini .cross (www + period , "dupla vé dupla vé dupla vé." )
100
+ | pynini .cross (www + period , "kettős vé kettős vé kettős vé." )
72
101
)
73
102
)
74
103
protocol = protocol_start | protocol_end | (protocol_start + protocol_end )
75
- protocol = pynutil .insert ("protocol: \" " ) + protocol + pynutil .insert ("\" " )
76
- graph |= protocol + insert_space + (domain_graph | domain_common_graph )
104
+ protocol = (
105
+ pynutil .insert (protocol_string + colon + NEMO_SPACE + double_quotes )
106
+ + protocol
107
+ + pynutil .insert (double_quotes )
108
+ )
109
+ url = protocol + pynutil .insert (NEMO_SPACE ) + (domain_graph )
110
+
111
+ graph = url | domain_graph | email | tag
77
112
self .graph = graph
78
113
79
114
final_graph = self .add_tokens (self .graph + pynutil .insert (" preserve_order: true" ))
0 commit comments