Skip to content

Commit 1054b80

Browse files
committed
use char32_t for namegen instead of char
1 parent 402184b commit 1054b80

File tree

3 files changed

+55
-40
lines changed

3 files changed

+55
-40
lines changed

bin/gf2_namegen.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ int main(int argc, char* argv[])
3131

3232
gf::NamegenSettings settings = {};
3333
settings.min_length = 3;
34-
settings.max_length = 15;
34+
settings.max_length = 12;
3535

3636
gf::Random random;
3737

include/gf2/core/Namegen.h

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,30 +17,30 @@ namespace gf {
1717

1818
class GF_CORE_API NamegenModel {
1919
public:
20-
NamegenModel(const std::vector<std::string>& data, std::size_t order, double prior, std::string alphabet);
20+
NamegenModel(const std::vector<std::u32string>& data, std::size_t order, double prior, std::u32string alphabet);
2121

22-
std::optional<char> generate(const std::string& context, Random& random) const;
23-
void retrain(const std::vector<std::string>& data);
22+
std::optional<char32_t> generate(const std::u32string& context, Random& random) const;
23+
void retrain(const std::vector<std::u32string>& data);
2424

2525
private:
26-
void train(const std::vector<std::string>& data);
26+
void train(const std::vector<std::u32string>& data);
2727
void build_chains();
2828

2929
std::size_t m_order = 1;
3030
double m_prior = 0.0;
31-
std::string m_alphabet;
32-
std::map<std::string, std::string> m_observations;
33-
std::map<std::string, std::vector<double>> m_chains;
31+
std::u32string m_alphabet;
32+
std::map<std::u32string, std::u32string> m_observations;
33+
std::map<std::u32string, std::vector<double>> m_chains;
3434
};
3535

3636
class GF_CORE_API NamegenGenerator {
3737
public:
38-
NamegenGenerator(const std::vector<std::string>& data, std::size_t order, double prior, bool backoff);
38+
NamegenGenerator(const std::vector<std::u32string>& data, std::size_t order, double prior, bool backoff);
3939

40-
std::string generate(Random& random) const;
40+
std::u32string generate(Random& random) const;
4141

4242
private:
43-
std::optional<char> compute_letter(const std::string& word, Random& random) const;
43+
std::optional<char32_t> compute_letter(const std::u32string& word, Random& random) const;
4444

4545
std::size_t m_order = 1;
4646
double m_prior = 0.0;

library/core/Namegen.cc

Lines changed: 44 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -6,24 +6,26 @@
66
#include <cassert>
77

88
#include <algorithm>
9+
#include <iterator>
910
#include <random>
1011
#include <set>
1112

1213
#include <gf2/core/Clock.h>
14+
#include <gf2/core/StringUtils.h>
1315

1416
namespace gf {
1517

1618
namespace {
1719

18-
constexpr char WordLimit = '#';
20+
constexpr char32_t WordLimit = '#';
1921

2022
}
2123

2224
/*
2325
* NamegenModel
2426
*/
2527

26-
NamegenModel::NamegenModel(const std::vector<std::string>& data, std::size_t order, double prior, std::string alphabet)
28+
NamegenModel::NamegenModel(const std::vector<std::u32string>& data, std::size_t order, double prior, std::u32string alphabet)
2729
: m_order(order)
2830
, m_prior(prior)
2931
, m_alphabet(std::move(alphabet))
@@ -33,7 +35,7 @@ namespace gf {
3335
build_chains();
3436
}
3537

36-
std::optional<char> NamegenModel::generate(const std::string& context, Random& random) const
38+
std::optional<char32_t> NamegenModel::generate(const std::u32string& context, Random& random) const
3739
{
3840
assert(context.size() == m_order);
3941
auto iterator = m_chains.find(context);
@@ -50,21 +52,21 @@ namespace gf {
5052
return m_alphabet[index];
5153
}
5254

53-
void NamegenModel::retrain(const std::vector<std::string>& data)
55+
void NamegenModel::retrain(const std::vector<std::u32string>& data)
5456
{
5557
train(data);
5658
build_chains();
5759
}
5860

59-
void NamegenModel::train(const std::vector<std::string>& data)
61+
void NamegenModel::train(const std::vector<std::u32string>& data)
6062
{
61-
for (const std::string& item : data) {
62-
std::string d = std::string(m_order, WordLimit) + item + WordLimit;
63+
for (const std::u32string& item : data) {
64+
const std::u32string sequence = std::u32string(m_order, WordLimit) + item + WordLimit;
6365

64-
for (std::size_t i = 0; i < d.size() - m_order; ++i) {
65-
const std::string key = d.substr(i, m_order);
66-
assert(i + m_order < d.size());
67-
m_observations[key].push_back(d[i + m_order]);
66+
for (std::size_t i = 0; i < sequence.size() - m_order; ++i) {
67+
const std::u32string key = sequence.substr(i, m_order);
68+
assert(i + m_order < sequence.size());
69+
m_observations[key].push_back(sequence[i + m_order]);
6870
}
6971
}
7072
}
@@ -76,7 +78,7 @@ namespace gf {
7678
for (auto& [ context, observation ] : m_observations) {
7779
std::vector<double>& values = m_chains[context];
7880

79-
for (char prediction : m_alphabet) {
81+
for (char32_t prediction : m_alphabet) {
8082
values.push_back(m_prior + static_cast<double>(std::count(observation.begin(), observation.end(), prediction)));
8183
}
8284
}
@@ -86,20 +88,20 @@ namespace gf {
8688
* NamegenGenerator
8789
*/
8890

89-
NamegenGenerator::NamegenGenerator(const std::vector<std::string>& data, std::size_t order, double prior, bool backoff)
91+
NamegenGenerator::NamegenGenerator(const std::vector<std::u32string>& data, std::size_t order, double prior, bool backoff)
9092
: m_order(order)
9193
, m_prior(prior)
9294
, m_backoff(backoff)
9395
{
94-
std::set<char> letters;
96+
std::set<char32_t> letters;
9597

96-
for (const std::string& item : data) {
97-
for (const char c : item) {
98+
for (const std::u32string& item : data) {
99+
for (const char32_t c : item) {
98100
letters.insert(c);
99101
}
100102
}
101103

102-
std::string alphabet(letters.begin(), letters.end());
104+
std::u32string alphabet(letters.begin(), letters.end());
103105
alphabet.push_back(WordLimit);
104106

105107
if (backoff) {
@@ -111,9 +113,9 @@ namespace gf {
111113
}
112114
}
113115

114-
std::string NamegenGenerator::generate(Random& random) const
116+
std::u32string NamegenGenerator::generate(Random& random) const
115117
{
116-
std::string word(m_order, WordLimit);
118+
std::u32string word(m_order, WordLimit);
117119

118120
auto maybe_letter = compute_letter(word, random);
119121

@@ -125,11 +127,11 @@ namespace gf {
125127
return word;
126128
}
127129

128-
std::optional<char> NamegenGenerator::compute_letter(const std::string& word, Random& random) const
130+
std::optional<char32_t> NamegenGenerator::compute_letter(const std::u32string& word, Random& random) const
129131
{
130132
assert(word.size() >= m_order);
131133

132-
std::string context = word.substr(word.size() - m_order);
134+
std::u32string context = word.substr(word.size() - m_order);
133135
assert(context.size() == m_order);
134136

135137
for (const NamegenModel& model : m_models) {
@@ -150,13 +152,20 @@ namespace gf {
150152
*/
151153

152154
namespace {
155+
std::vector<std::u32string> to_utf32_strings(const std::vector<std::string>& data)
156+
{
157+
std::vector<std::u32string> utf32_data;
158+
std::transform(data.begin(), data.end(), std::back_inserter(utf32_data), to_utf32);
159+
return utf32_data;
160+
}
153161

154-
bool satisfy_settings(const std::string& word, const NamegenSettings& settings)
162+
bool satisfy_size_settings(const std::u32string& word, const NamegenSettings& settings)
155163
{
156-
if (word.size() < settings.min_length || word.size() > settings.max_length) {
157-
return false;
158-
}
164+
return settings.min_length <= word.size() && word.size() <= settings.max_length;
165+
}
159166

167+
bool satisfy_settings(const std::string& word, const NamegenSettings& settings)
168+
{
160169
if (settings.starts_with.size() > word.size() || word.substr(0, settings.starts_with.size()) != settings.starts_with) {
161170
return false;
162171
}
@@ -179,17 +188,23 @@ namespace gf {
179188
}
180189

181190
NamegenManager::NamegenManager(const std::vector<std::string>& data, std::size_t order, double prior, bool backoff)
182-
: m_generator(data, order, prior, backoff)
191+
: m_generator(to_utf32_strings(data), order, prior, backoff)
183192
{
184193
}
185194

186195
std::optional<std::string> NamegenManager::generate_single(Random& random, const NamegenSettings& settings) const
187196
{
188-
std::string name = m_generator.generate(random);
197+
std::u32string name = m_generator.generate(random);
189198
name.erase(std::remove(name.begin(), name.end(), WordLimit), name.end());
190199

191-
if (satisfy_settings(name, settings)) {
192-
return name;
200+
if (!satisfy_size_settings(name, settings)) {
201+
return std::nullopt;
202+
}
203+
204+
std::string utf8_name = to_utf8(name);
205+
206+
if (satisfy_settings(utf8_name, settings)) {
207+
return utf8_name;
193208
}
194209

195210
return std::nullopt;

0 commit comments

Comments
 (0)