6
6
#include < cassert>
7
7
8
8
#include < algorithm>
9
+ #include < iterator>
9
10
#include < random>
10
11
#include < set>
11
12
12
13
#include < gf2/core/Clock.h>
14
+ #include < gf2/core/StringUtils.h>
13
15
14
16
namespace gf {
15
17
16
18
namespace {
17
19
18
- constexpr char WordLimit = ' #' ;
20
+ constexpr char32_t WordLimit = ' #' ;
19
21
20
22
}
21
23
22
24
/*
23
25
* NamegenModel
24
26
*/
25
27
26
- NamegenModel::NamegenModel (const std::vector<std::string >& data, std::size_t order, double prior, std::string alphabet)
28
+ NamegenModel::NamegenModel (const std::vector<std::u32string >& data, std::size_t order, double prior, std::u32string alphabet)
27
29
: m_order(order)
28
30
, m_prior(prior)
29
31
, m_alphabet(std::move(alphabet))
@@ -33,7 +35,7 @@ namespace gf {
33
35
build_chains ();
34
36
}
35
37
36
- std::optional<char > NamegenModel::generate (const std::string & context, Random& random) const
38
+ std::optional<char32_t > NamegenModel::generate (const std::u32string & context, Random& random) const
37
39
{
38
40
assert (context.size () == m_order);
39
41
auto iterator = m_chains.find (context);
@@ -50,21 +52,21 @@ namespace gf {
50
52
return m_alphabet[index];
51
53
}
52
54
53
- void NamegenModel::retrain (const std::vector<std::string >& data)
55
+ void NamegenModel::retrain (const std::vector<std::u32string >& data)
54
56
{
55
57
train (data);
56
58
build_chains ();
57
59
}
58
60
59
- void NamegenModel::train (const std::vector<std::string >& data)
61
+ void NamegenModel::train (const std::vector<std::u32string >& data)
60
62
{
61
- for (const std::string & item : data) {
62
- std::string d = std::string (m_order, WordLimit) + item + WordLimit;
63
+ for (const std::u32string & item : data) {
64
+ const std::u32string sequence = std::u32string (m_order, WordLimit) + item + WordLimit;
63
65
64
- for (std::size_t i = 0 ; i < d .size () - m_order; ++i) {
65
- const std::string key = d .substr (i, m_order);
66
- assert (i + m_order < d .size ());
67
- m_observations[key].push_back (d [i + m_order]);
66
+ for (std::size_t i = 0 ; i < sequence .size () - m_order; ++i) {
67
+ const std::u32string key = sequence .substr (i, m_order);
68
+ assert (i + m_order < sequence .size ());
69
+ m_observations[key].push_back (sequence [i + m_order]);
68
70
}
69
71
}
70
72
}
@@ -76,7 +78,7 @@ namespace gf {
76
78
for (auto & [ context, observation ] : m_observations) {
77
79
std::vector<double >& values = m_chains[context];
78
80
79
- for (char prediction : m_alphabet) {
81
+ for (char32_t prediction : m_alphabet) {
80
82
values.push_back (m_prior + static_cast <double >(std::count (observation.begin (), observation.end (), prediction)));
81
83
}
82
84
}
@@ -86,20 +88,20 @@ namespace gf {
86
88
* NamegenGenerator
87
89
*/
88
90
89
- NamegenGenerator::NamegenGenerator (const std::vector<std::string >& data, std::size_t order, double prior, bool backoff)
91
+ NamegenGenerator::NamegenGenerator (const std::vector<std::u32string >& data, std::size_t order, double prior, bool backoff)
90
92
: m_order(order)
91
93
, m_prior(prior)
92
94
, m_backoff(backoff)
93
95
{
94
- std::set<char > letters;
96
+ std::set<char32_t > letters;
95
97
96
- for (const std::string & item : data) {
97
- for (const char c : item) {
98
+ for (const std::u32string & item : data) {
99
+ for (const char32_t c : item) {
98
100
letters.insert (c);
99
101
}
100
102
}
101
103
102
- std::string alphabet (letters.begin (), letters.end ());
104
+ std::u32string alphabet (letters.begin (), letters.end ());
103
105
alphabet.push_back (WordLimit);
104
106
105
107
if (backoff) {
@@ -111,9 +113,9 @@ namespace gf {
111
113
}
112
114
}
113
115
114
- std::string NamegenGenerator::generate (Random& random) const
116
+ std::u32string NamegenGenerator::generate (Random& random) const
115
117
{
116
- std::string word (m_order, WordLimit);
118
+ std::u32string word (m_order, WordLimit);
117
119
118
120
auto maybe_letter = compute_letter (word, random);
119
121
@@ -125,11 +127,11 @@ namespace gf {
125
127
return word;
126
128
}
127
129
128
- std::optional<char > NamegenGenerator::compute_letter (const std::string & word, Random& random) const
130
+ std::optional<char32_t > NamegenGenerator::compute_letter (const std::u32string & word, Random& random) const
129
131
{
130
132
assert (word.size () >= m_order);
131
133
132
- std::string context = word.substr (word.size () - m_order);
134
+ std::u32string context = word.substr (word.size () - m_order);
133
135
assert (context.size () == m_order);
134
136
135
137
for (const NamegenModel& model : m_models) {
@@ -150,13 +152,20 @@ namespace gf {
150
152
*/
151
153
152
154
namespace {
155
+ std::vector<std::u32string> to_utf32_strings (const std::vector<std::string>& data)
156
+ {
157
+ std::vector<std::u32string> utf32_data;
158
+ std::transform (data.begin (), data.end (), std::back_inserter (utf32_data), to_utf32);
159
+ return utf32_data;
160
+ }
153
161
154
- bool satisfy_settings (const std::string & word, const NamegenSettings& settings)
162
+ bool satisfy_size_settings (const std::u32string & word, const NamegenSettings& settings)
155
163
{
156
- if (word.size () < settings.min_length || word.size () > settings.max_length ) {
157
- return false ;
158
- }
164
+ return settings.min_length <= word.size () && word.size () <= settings.max_length ;
165
+ }
159
166
167
+ bool satisfy_settings (const std::string& word, const NamegenSettings& settings)
168
+ {
160
169
if (settings.starts_with .size () > word.size () || word.substr (0 , settings.starts_with .size ()) != settings.starts_with ) {
161
170
return false ;
162
171
}
@@ -179,17 +188,23 @@ namespace gf {
179
188
}
180
189
181
190
/// Builds the manager's generator from narrow strings.
///
/// The input is first converted with to_utf32_strings() because the
/// underlying generator works on std::u32string data.
///
/// @param data    training words.
/// @param order   forwarded unchanged to the generator.
/// @param prior   forwarded unchanged to the generator.
/// @param backoff forwarded unchanged to the generator.
NamegenManager::NamegenManager(
    const std::vector<std::string>& data,
    std::size_t order,
    double prior,
    bool backoff)
: m_generator(to_utf32_strings(data), order, prior, backoff)
{
}
185
194
186
195
std::optional<std::string> NamegenManager::generate_single (Random& random, const NamegenSettings& settings) const
187
196
{
188
- std::string name = m_generator.generate (random);
197
+ std::u32string name = m_generator.generate (random);
189
198
name.erase (std::remove (name.begin (), name.end (), WordLimit), name.end ());
190
199
191
- if (satisfy_settings (name, settings)) {
192
- return name;
200
+ if (!satisfy_size_settings (name, settings)) {
201
+ return std::nullopt;
202
+ }
203
+
204
+ std::string utf8_name = to_utf8 (name);
205
+
206
+ if (satisfy_settings (utf8_name, settings)) {
207
+ return utf8_name;
193
208
}
194
209
195
210
return std::nullopt;
0 commit comments