Skip to content

Commit d09c80d

Browse files
committed
add namegen classes
1 parent 9b2b171 commit d09c80d

File tree

2 files changed

+287
-0
lines changed

2 files changed

+287
-0
lines changed

include/gf2/core/Namegen.h

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
// SPDX-License-Identifier: Zlib
2+
// Copyright (c) 2023-2025 Julien Bernard
3+
#ifndef GF_NAMEGEN_H
4+
#define GF_NAMEGEN_H
5+
6+
#include <limits>
7+
#include <map>
8+
#include <optional>
9+
#include <string>
10+
#include <vector>
11+
12+
#include "CoreApi.h"
13+
#include "Random.h"
14+
#include "Time.h"
15+
16+
namespace gf {
17+
18+
class GF_CORE_API NamegenModel {
19+
public:
20+
NamegenModel(const std::vector<std::string>& data, std::size_t order, double prior, std::string alphabet);
21+
22+
std::optional<char> generate(const std::string& context, Random& random) const;
23+
void retrain(const std::vector<std::string>& data);
24+
25+
private:
26+
void train(const std::vector<std::string>& data);
27+
void build_chains();
28+
29+
std::size_t m_order = 1;
30+
double m_prior = 0.0;
31+
std::string m_alphabet;
32+
std::map<std::string, std::string> m_observations;
33+
std::map<std::string, std::vector<double>> m_chains;
34+
};
35+
36+
class GF_CORE_API NamegenGenerator {
37+
public:
38+
NamegenGenerator(const std::vector<std::string>& data, std::size_t order, double prior, bool backoff);
39+
40+
std::string generate(Random& random) const;
41+
42+
private:
43+
std::optional<char> compute_letter(const std::string& word, Random& random) const;
44+
45+
std::size_t m_order = 1;
46+
double m_prior = 0.0;
47+
bool m_backoff = true;
48+
std::vector<NamegenModel> m_models;
49+
50+
};
51+
52+
// Constraints that a generated name must satisfy to be accepted.
// The defaults accept every name.
struct NamegenSettings {
  // Smallest accepted length.
  std::size_t min_length = 0;
  // Largest accepted length.
  std::size_t max_length = std::numeric_limits<std::size_t>::max();
  // Required prefix (empty: no constraint).
  std::string starts_with;
  // Required suffix (empty: no constraint).
  std::string ends_with;
  // Substring that must be present (empty: no constraint).
  std::string includes;
  // Substring that must be absent (empty: no constraint).
  std::string excludes;
};
60+
61+
class GF_CORE_API NamegenManager {
62+
public:
63+
NamegenManager(const std::vector<std::string>& data, std::size_t order, double prior, bool backoff);
64+
65+
std::optional<std::string> generate_single(Random& random, const NamegenSettings& settings = {});
66+
std::vector<std::string> generate_multiple(Random& random, std::size_t count, Time max_time_per_name, const NamegenSettings& settings = {});
67+
68+
private:
69+
NamegenGenerator m_generator;
70+
};
71+
72+
}
73+
74+
#endif // GF_NAMEGEN_H

library/core/Namegen.cc

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
// SPDX-License-Identifier: Zlib
2+
// Copyright (c) 2023-2025 Julien Bernard
3+
4+
#include <gf2/core/Namegen.h>
5+
6+
#include <cassert>
7+
8+
#include <algorithm>
9+
#include <random>
10+
#include <set>
11+
12+
#include <gf2/core/Clock.h>
13+
14+
namespace gf {
15+
16+
namespace {

  // Marker used to pad the beginning of a word and to signal its end.
  constexpr char WordLimit = '#';

}
21+
22+
/*
23+
 * NamegenModel
24+
*/
25+
26+
// Creates a model of the given order and immediately trains it on `data`.
// `alphabet` is taken by value and moved into the model.
NamegenModel::NamegenModel(const std::vector<std::string>& data, std::size_t order, double prior, std::string alphabet)
: m_order(order)
, m_prior(prior)
, m_alphabet(std::move(alphabet))
{
  // The prior must lie in [0, 1]; this is only checked when asserts are enabled.
  assert(prior >= 0.0 && prior <= 1.0);

  train(data);
  build_chains();
}
35+
36+
std::optional<char> NamegenModel::generate(const std::string& context, Random& random) const
37+
{
38+
assert(context.size() == m_order);
39+
auto iterator = m_chains.find(context);
40+
41+
if (iterator == m_chains.end()) {
42+
return std::nullopt;
43+
}
44+
45+
const std::vector<double>& weights = iterator->second;
46+
47+
std::discrete_distribution<std::size_t> distribution(weights.begin(), weights.end());
48+
auto index = distribution(random.engine());
49+
assert(index < m_alphabet.size());
50+
return m_alphabet[index];
51+
}
52+
53+
void NamegenModel::retrain(const std::vector<std::string>& data)
54+
{
55+
train(data);
56+
build_chains();
57+
}
58+
59+
void NamegenModel::train(const std::vector<std::string>& data)
60+
{
61+
for (const std::string& item : data) {
62+
std::string d = std::string(m_order, WordLimit) + item + WordLimit;
63+
64+
for (std::size_t i = 0; i < d.size() - m_order; ++i) {
65+
const std::string key = d.substr(i, m_order);
66+
assert(i + m_order < d.size());
67+
m_observations[key].push_back(d[i + m_order]);
68+
}
69+
}
70+
}
71+
72+
void NamegenModel::build_chains()
73+
{
74+
m_chains.clear();
75+
76+
for (auto& [ context, observation ] : m_observations) {
77+
std::vector<double>& values = m_chains[context];
78+
79+
for (char prediction : m_alphabet) {
80+
values.push_back(m_prior + static_cast<double>(std::count(observation.begin(), observation.end(), prediction)));
81+
}
82+
}
83+
}
84+
85+
/*
86+
* NamegenGenerator
87+
*/
88+
89+
// Builds the alphabet (every distinct character of `data`, sorted, followed
// by the word limit) and creates the models.
//
// With `backoff`, one model per order from `order` down to 1 is created,
// highest order first; otherwise a single model of order `order` is created.
NamegenGenerator::NamegenGenerator(const std::vector<std::string>& data, std::size_t order, double prior, bool backoff)
: m_order(order)
, m_prior(prior)
, m_backoff(backoff)
{
  std::set<char> symbols;

  for (const std::string& item : data) {
    symbols.insert(item.begin(), item.end());
  }

  std::string alphabet(symbols.begin(), symbols.end());
  alphabet += WordLimit;

  if (!backoff) {
    m_models.emplace_back(data, order, prior, alphabet);
    return;
  }

  for (std::size_t current = order; current > 0; --current) {
    m_models.emplace_back(data, current, prior, alphabet);
  }
}
113+
114+
// Generates one word, letter by letter.
//
// The word starts as m_order word limits (the initial context) and grows
// until no letter can be computed or the word limit is drawn. The leading
// word limits are part of the returned string.
std::string NamegenGenerator::generate(Random& random) const
{
  std::string word(m_order, WordLimit);

  for (;;) {
    const std::optional<char> next = compute_letter(word, random);

    if (!next.has_value() || *next == WordLimit) {
      break;
    }

    word.push_back(*next);
  }

  return word;
}
127+
128+
std::optional<char> NamegenGenerator::compute_letter(const std::string& word, Random& random) const
129+
{
130+
assert(word.size() >= m_order);
131+
132+
std::string context = word.substr(word.size() - m_order);
133+
assert(context.size() == m_order);
134+
135+
for (const NamegenModel& model : m_models) {
136+
auto maybe_letter = model.generate(context, random);
137+
138+
if (maybe_letter && maybe_letter.value() != WordLimit) {
139+
return maybe_letter;
140+
}
141+
142+
context = context.substr(1);
143+
}
144+
145+
return std::nullopt;
146+
}
147+
148+
/*
149+
* NamegenManager
150+
*/
151+
152+
namespace {
153+
154+
bool satisfy_settings(const std::string& word, const NamegenSettings& settings)
155+
{
156+
if (word.size() < settings.min_length || word.size() > settings.max_length) {
157+
return false;
158+
}
159+
160+
if (settings.starts_with.size() > word.size() || word.substr(0, settings.starts_with.size()) != settings.starts_with) {
161+
return false;
162+
}
163+
164+
if (settings.ends_with.size() > word.size() || word.substr(word.size() - settings.ends_with.size()) != settings.ends_with) {
165+
return false;
166+
}
167+
168+
if (!settings.includes.empty() && word.find(settings.includes) == std::string::npos) {
169+
return false;
170+
}
171+
172+
if (!settings.excludes.empty() && word.find(settings.excludes) != std::string::npos) {
173+
return false;
174+
}
175+
176+
return true;
177+
}
178+
179+
}
180+
181+
// Creates the underlying generator; every parameter is forwarded unchanged.
NamegenManager::NamegenManager(const std::vector<std::string>& data, std::size_t order, double prior, bool backoff)
: m_generator(data, order, prior, backoff)
{
  // nothing else to set up
}
185+
186+
std::optional<std::string> NamegenManager::generate_single(Random& random, const NamegenSettings& settings)
187+
{
188+
std::string name = m_generator.generate(random);
189+
name.erase(std::remove(name.begin(), name.end(), WordLimit), name.end());
190+
191+
if (satisfy_settings(name, settings)) {
192+
return name;
193+
}
194+
195+
return std::nullopt;
196+
}
197+
198+
std::vector<std::string> NamegenManager::generate_multiple(Random& random, std::size_t count, Time max_time_per_name, const NamegenSettings& settings)
199+
{
200+
std::vector<std::string> names;
201+
Clock clock;
202+
203+
while (names.size() < count && clock.restart() < max_time_per_name) {
204+
if (auto maybe_name = generate_single(random, settings); maybe_name) {
205+
names.push_back(std::move(maybe_name).value());
206+
}
207+
}
208+
209+
return names;
210+
}
211+
212+
}
213+

0 commit comments

Comments
 (0)