Skip to content

Commit 6c0bebe

Browse files
committed
Implement lazy enumeration of fuzzy trie
1 parent c231400 commit 6c0bebe

File tree

11 files changed

+338
-139
lines changed

11 files changed

+338
-139
lines changed

Levenshtypo.sln

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@ Global
3939
{492B31DC-9CA1-49AC-8592-F65CCE26EA07}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
4040
{492B31DC-9CA1-49AC-8592-F65CCE26EA07}.Debug|Any CPU.Build.0 = Debug|Any CPU
4141
{492B31DC-9CA1-49AC-8592-F65CCE26EA07}.Release|Any CPU.ActiveCfg = Release|Any CPU
42-
{492B31DC-9CA1-49AC-8592-F65CCE26EA07}.Release|Any CPU.Build.0 = Release|Any CPU
4342
{EA373273-9ED6-45EC-9AC4-CC01A19B5B24}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
4443
{EA373273-9ED6-45EC-9AC4-CC01A19B5B24}.Debug|Any CPU.Build.0 = Debug|Any CPU
4544
{EA373273-9ED6-45EC-9AC4-CC01A19B5B24}.Release|Any CPU.ActiveCfg = Release|Any CPU

README.md

Lines changed: 30 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,7 @@ public class TypoSuggestionExample
4444

4545
public TypoSuggestionExample(IEnumerable<string> words)
4646
{
47-
_trie = Levenshtrie<string>.Create(
48-
words.Select(w => new KeyValuePair<string, string>(w, w)),
49-
ignoreCase: true);
47+
_trie = Levenshtrie.CreateStrings(words, ignoreCase: true);
5048
}
5149

5250
public string[] GetSimilarWords(string word)
@@ -72,24 +70,15 @@ public class BlacklistDetectionExample
7270

7371
public BlacklistDetectionExample(IEnumerable<string> blacklist)
7472
{
75-
_trie = Levenshtrie<string>.Create(
76-
blacklist.Select(w => new KeyValuePair<string, string>(w, w)),
77-
ignoreCase: true);
73+
_trie = Levenshtrie.CreateStrings(blacklist, ignoreCase: true);
7874
}
7975

8076
public bool IsBlacklisted(string word)
8177
{
82-
LevenshtrieSearchResult<string>[] searchResults = _trie.Search(word, maxEditDistance: 2);
83-
return searchResults.Any(result => DetailedCompare(result.Distance, result.Result, word));
84-
}
85-
86-
private bool DetailedCompare(int distance, string blacklistedWord, string word)
87-
{
88-
// Your custom logic goes here
89-
return true;
78+
IEnumerable<LevenshtrieSearchResult<string>> searchResults = _trie.EnumerateSearch(word, maxEditDistance: 1);
79+
return searchResults.Any();
9080
}
9181
}
92-
9382
```
9483

9584
</details>
@@ -209,30 +198,30 @@ The English Language dataset used in the benchmarks contains approximately 465,0
209198
<summary>Search the entire English-language dataset with a fuzzy key</summary>
210199

211200
- **Naive**: Compute Levenshtein Distance against all words.
212-
- **Levenshtypo**: This library.
201+
- **Levenshtypo_All**: This library, with all results buffered into an array.
202+
- **Levenshtypo_Lazy**: This library, with lazy evaluation (`IEnumerable`).
203+
- **Levenshtypo_Any**: This library, with lazy evaluation (`IEnumerable`), stopping at the first result.
213204
- **Dictionary**: .NET Dictionary which only works for distance of 0.
214205

215-
```
216-
217-
BenchmarkDotNet v0.13.12, Windows 11 (10.0.22631.3880/23H2/2023Update/SunValley3)
218-
AMD Ryzen 9 5950X, 1 CPU, 32 logical and 16 physical cores
219-
.NET SDK 8.0.400-preview.0.24324.5
220-
[Host] : .NET 8.0.6 (8.0.624.26715), X64 RyuJIT AVX2
221-
DefaultJob : .NET 8.0.6 (8.0.624.26715), X64 RyuJIT AVX2
222-
223-
224-
```
225-
| Method | Mean | Error | StdDev | Gen0 | Allocated |
226-
|---------------------- |------------------:|------------------:|------------------:|-------:|----------:|
227-
| Distance0_Dictionary | 8.548 ns | 0.0096 ns | 0.0081 ns | - | - |
228-
| Distance0_Levenshtypo | 331.396 ns | 1.0820 ns | 0.9035 ns | 0.0124 | 208 B |
229-
| Distance1_Levenshtypo | 18,655.543 ns | 176.4438 ns | 156.4128 ns | - | 424 B |
230-
| Distance2_Levenshtypo | 260,006.508 ns | 952.2781 ns | 844.1697 ns | - | 1832 B |
231-
| Distance3_Levenshtypo | 1,518,877.956 ns | 25,025.3556 ns | 23,408.7332 ns | - | 17905 B |
232-
| Distance0_Naive | 805,520.354 ns | 15,697.4007 ns | 13,915.3369 ns | - | 89 B |
233-
| Distance1_Naive | 68,290,143.333 ns | 1,318,565.4424 ns | 1,233,386.9329 ns | - | 180 B |
234-
| Distance2_Naive | 71,591,125.123 ns | 1,408,712.7964 ns | 2,064,872.9517 ns | - | 713 B |
235-
| Distance3_Naive | 70,418,511.111 ns | 1,378,967.7153 ns | 1,933,120.1471 ns | - | 4356 B |
206+
| Method | Mean | Allocated |
207+
|--------------------------- |------------------:|----------:|
208+
| Distance0_Levenshtypo_All | 361.444 ns | 240 B |
209+
| Distance0_Levenshtypo_Lazy | 975.169 ns | 480 B |
210+
| Distance0_Levenshtypo_Any | 614.947 ns | 480 B |
211+
| Distance0_Dictionary | 9.128 ns | - |
212+
| Distance0_Naive | 813,419.616 ns | 89 B |
213+
| Distance1_Levenshtypo_All | 19,008.096 ns | 536 B |
214+
| Distance1_Levenshtypo_Lazy | 38,615.868 ns | 480 B |
215+
| Distance1_Levenshtypo_Any | 25,805.258 ns | 480 B |
216+
| Distance1_Naive | 73,459,775.661 ns | 193 B |
217+
| Distance2_Levenshtypo_All | 276,157.020 ns | 2600 B |
218+
| Distance2_Levenshtypo_Lazy | 440,689.397 ns | 480 B |
219+
| Distance2_Levenshtypo_Any | 215,542.244 ns | 480 B |
220+
| Distance2_Naive | 68,999,745.833 ns | 700 B |
221+
| Distance3_Levenshtypo_All | 1,617,282.340 ns | 25985 B |
222+
| Distance3_Levenshtypo_Lazy | 2,452,026.901 ns | 1123 B |
223+
| Distance3_Levenshtypo_Any | 231,972.804 ns | 584 B |
224+
| Distance3_Naive | 71,845,738.624 ns | 4369 B |
236225

237226
</details>
238227

@@ -242,20 +231,10 @@ AMD Ryzen 9 5950X, 1 CPU, 32 logical and 16 physical cores
242231
- **Levenshtypo**: This library.
243232
- **Dictionary**: .NET Dictionary for comparison.
244233

245-
```
246-
247-
BenchmarkDotNet v0.13.12, Windows 11 (10.0.22631.3880/23H2/2023Update/SunValley3)
248-
AMD Ryzen 9 5950X, 1 CPU, 32 logical and 16 physical cores
249-
.NET SDK 8.0.400-preview.0.24324.5
250-
[Host] : .NET 8.0.6 (8.0.624.26715), X64 RyuJIT AVX2
251-
DefaultJob : .NET 8.0.6 (8.0.624.26715), X64 RyuJIT AVX2
252-
253-
254-
```
255-
| Method | Mean | Error | StdDev | Gen0 | Gen1 | Gen2 | Allocated |
256-
|-------------------- |--------------:|-------------:|-------------:|----------:|---------:|---------:|-------------:|
257-
| English_Dictionary | 34,213.49 μs | 665.436 μs | 1,074.555 μs | 750.0000 | 750.0000 | 750.0000 | 35524.21 KB |
258-
| English_Levenshtypo | 139,977.62 μs | 1,479.846 μs | 1,384.249 μs | 4250.0000 | 750.0000 | 750.0000 | 168067.98 KB |
234+
| Method | Mean | Allocated |
235+
|-------------------- |--------------:|-------------:|
236+
| English_Dictionary | 31,755.45 μs | 35524.19 KB |
237+
| English_Levenshtypo | 142,010.47 μs | 145145.15 KB |
259238

260239
</details>
261240

samples/Levenshtypo.Samples/BlacklistDetectionExample.cs

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,28 +2,20 @@
22

33
/// <summary>
44
/// An example class wrapping the Levenshtypo library to detect if
5-
/// a given word is similar to a blacklisted word.
5+
/// a given word is similar enough to a blacklisted word.
66
/// </summary>
77
public class BlacklistDetectionExample
88
{
99
private readonly Levenshtrie<string> _trie;
1010

1111
public BlacklistDetectionExample(IEnumerable<string> blacklist)
1212
{
13-
_trie = Levenshtrie<string>.Create(
14-
blacklist.Select(w => new KeyValuePair<string, string>(w, w)),
15-
ignoreCase: true);
13+
_trie = Levenshtrie.CreateStrings(blacklist, ignoreCase: true);
1614
}
1715

1816
public bool IsBlacklisted(string word)
1917
{
20-
LevenshtrieSearchResult<string>[] searchResults = _trie.Search(word, maxEditDistance: 2);
21-
return searchResults.Any(result => DetailedCompare(result.Distance, result.Result, word));
22-
}
23-
24-
private bool DetailedCompare(int distance, string blacklistedWord, string word)
25-
{
26-
// Your custom logic goes here
27-
return true;
18+
IEnumerable<LevenshtrieSearchResult<string>> searchResults = _trie.EnumerateSearch(word, maxEditDistance: 1);
19+
return searchResults.Any();
2820
}
2921
}

samples/Levenshtypo.Samples/BooleanCombinationsExample.cs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,7 @@ public class BooleanCombinationsExample
1111

1212
public BooleanCombinationsExample(IEnumerable<string> words)
1313
{
14-
_trie = Levenshtrie<string>.Create(
15-
words.Select(w => new KeyValuePair<string, string>(w, w)));
14+
_trie = Levenshtrie.CreateStrings(words);
1615
}
1716

1817
public string[] SearchCommon(string a, string b)

samples/Levenshtypo.Samples/TypoSuggestionExample.cs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,7 @@ public class TypoSuggestionExample
1010

1111
public TypoSuggestionExample(IEnumerable<string> words)
1212
{
13-
_trie = Levenshtrie<string>.Create(
14-
words.Select(w => new KeyValuePair<string, string>(w, w)),
15-
ignoreCase: true);
13+
_trie = Levenshtrie.CreateStrings(words, ignoreCase: true);
1614
}
1715

1816
public string[] GetSimilarWords(string word)

src/Levenshtypo.Benchmarks/LevenshtrieSearchTests.cs

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,15 @@ public class LevenshtrieSearchTests
1919
[Benchmark]
2020
public object Distance0_Dictionary() => _dictionary[SearchWord];
2121

22+
[Benchmark]
23+
public object Distance0_Levenshtypo_All() => _levenshtrie.Search(_automaton0);
24+
25+
[Benchmark]
26+
public object Distance0_Levenshtypo_Lazy() => _levenshtrie.EnumerateSearch(_automaton0).Count();
27+
28+
[Benchmark]
29+
public object Distance0_Levenshtypo_Any() => _levenshtrie.EnumerateSearch(_automaton0).Any();
30+
2231
[Benchmark]
2332
public object Distance0_Naive()
2433
{
@@ -35,7 +44,13 @@ public object Distance0_Naive()
3544
}
3645

3746
[Benchmark]
38-
public object Distance0_Levenshtypo() => _levenshtrie.Search(_automaton0);
47+
public object Distance1_Levenshtypo_All() => _levenshtrie.Search(_automaton1);
48+
49+
[Benchmark]
50+
public object Distance1_Levenshtypo_Lazy() => _levenshtrie.EnumerateSearch(_automaton1).Count();
51+
52+
[Benchmark]
53+
public object Distance1_Levenshtypo_Any() => _levenshtrie.EnumerateSearch(_automaton1).Any();
3954

4055
[Benchmark]
4156
public object Distance1_Naive()
@@ -53,7 +68,13 @@ public object Distance1_Naive()
5368
}
5469

5570
[Benchmark]
56-
public object Distance1_Levenshtypo() => _levenshtrie.Search(_automaton1);
71+
public object Distance2_Levenshtypo_All() => _levenshtrie.Search(_automaton2);
72+
73+
[Benchmark]
74+
public object Distance2_Levenshtypo_Lazy() => _levenshtrie.EnumerateSearch(_automaton2).Count();
75+
76+
[Benchmark]
77+
public object Distance2_Levenshtypo_Any() => _levenshtrie.EnumerateSearch(_automaton2).Any();
5778

5879
[Benchmark]
5980
public object Distance2_Naive()
@@ -71,7 +92,13 @@ public object Distance2_Naive()
7192
}
7293

7394
[Benchmark]
74-
public object Distance2_Levenshtypo() => _levenshtrie.Search(_automaton2);
95+
public object Distance3_Levenshtypo_All() => _levenshtrie.Search(_automaton3);
96+
97+
[Benchmark]
98+
public object Distance3_Levenshtypo_Lazy() => _levenshtrie.EnumerateSearch(_automaton3).Count();
99+
100+
[Benchmark]
101+
public object Distance3_Levenshtypo_Any() => _levenshtrie.EnumerateSearch(_automaton3).Any();
75102

76103
[Benchmark]
77104
public object Distance3_Naive()
@@ -87,7 +114,4 @@ public object Distance3_Naive()
87114
}
88115
return results;
89116
}
90-
91-
[Benchmark]
92-
public object Distance3_Levenshtypo() => _levenshtrie.Search(_automaton3);
93117
}

src/Levenshtypo.Tests/LevenshtrieSearchTests.cs

Lines changed: 46 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
using Shouldly;
1+
using System.Diagnostics.CodeAnalysis;
2+
using Shouldly;
23

34
namespace Levenshtypo.Tests;
45

@@ -10,30 +11,20 @@ public void EmptyString()
1011
string[] entries = ["", "1", "12", "123"];
1112
var t = Levenshtrie<string>.Create(entries.Select(e => new KeyValuePair<string, string>(e, e)));
1213

13-
t.Search(new LevenshtomatonFactory().Construct("", 2)).Select(r => r.Result)
14-
.ShouldBe(["", "1", "12"], ignoreOrder: true);
15-
16-
t.Search(new LevenshtomatonFactory().Construct("1", 1)).Select(r => r.Result)
17-
.ShouldBe(["", "1", "12"], ignoreOrder: true);
14+
Test(t, "", 2, ["", "1", "12"]);
15+
Test(t, "1", 1, ["", "1", "12"]);
1816
}
1917

2018
[Fact]
2119
public void Food()
2220
{
23-
string[] entries = ["f", "food", "good", "mood", "flood", "fod", "fob", "foodie", "\U0002f971"];
21+
string[] entries = ["mood", "f", "food", "good", "dood", "flood", "fod", "fob", "foodie", "\U0002f971"];
2422
var t = Levenshtrie<string>.Create(entries.Select(e => new KeyValuePair<string, string>(e, e)));
2523

26-
t.Search(new LevenshtomatonFactory().Construct("food", 0)).Select(r => r.Result)
27-
.ShouldBe(["food"], ignoreOrder: true);
28-
29-
t.Search(new LevenshtomatonFactory().Construct("food", 1)).Select(r => r.Result)
30-
.ShouldBe(["food", "good", "mood", "flood", "fod"], ignoreOrder: true);
31-
32-
t.Search(new LevenshtomatonFactory().Construct("food", 2)).Select(r => r.Result)
33-
.ShouldBe(["food", "good", "mood", "flood", "fod", "fob", "foodie"], ignoreOrder: true);
34-
35-
t.Search(new LevenshtomatonFactory().Construct("\U0001f970", 1)).Select(r => r.Result)
36-
.ShouldBe(["f", "\U0002f971"], ignoreOrder: true);
24+
Test(t, "food", 0, ["food"]);
25+
Test(t, "food", 1, ["food", "good", "dood", "mood", "flood", "fod"]);
26+
Test(t, "food", 2, ["food", "good", "dood", "mood", "flood", "fod", "fob", "foodie"]);
27+
Test(t, "\U0001f970", 1, ["f", "\U0002f971"]);
3728
}
3829

3930
[Fact]
@@ -56,22 +47,10 @@ void RunTest(string word)
5647
{
5748
var search = FindWordsWithinNDistance(word, 3);
5849

59-
levenshtrie
60-
.Search(word, 0)
61-
.ShouldBe(search.Where(x => x.distance <= 0).Select(x => new LevenshtrieSearchResult<string>(x.distance, x.word)), ignoreOrder: true);
62-
63-
levenshtrie.
64-
Search(word, 1)
65-
.ShouldBe(search.Where(x => x.distance <= 1).Select(x => new LevenshtrieSearchResult<string>(x.distance, x.word)), ignoreOrder: true);
66-
67-
levenshtrie
68-
.Search(word, 2)
69-
.ShouldBe(search.Where(x => x.distance <= 2).Select(x => new LevenshtrieSearchResult<string>(x.distance, x.word)), ignoreOrder: true);
70-
71-
levenshtrie
72-
.Search(word, 3)
73-
.ShouldBe(search.Where(x => x.distance <= 3).Select(x => new LevenshtrieSearchResult<string>(x.distance, x.word)), ignoreOrder: true);
74-
50+
Test(levenshtrie, word, 0, search.Where(x => x.distance <= 0));
51+
Test(levenshtrie, word, 1, search.Where(x => x.distance <= 1));
52+
Test(levenshtrie, word, 2, search.Where(x => x.distance <= 2));
53+
Test(levenshtrie, word, 3, search.Where(x => x.distance <= 3));
7554
}
7655

7756
IReadOnlyList<(string word, int distance)> FindWordsWithinNDistance(string query, int maxEditDistance)
@@ -97,8 +76,7 @@ public void StackOverflow_Scenario1()
9776
string[] entries = [new string('a', 10_000) + 'a', new string('a', 10_000) + 'b'];
9877
var t = Levenshtrie<string>.Create(entries.Select(e => new KeyValuePair<string, string>(e, e)));
9978

100-
t.Search(entries[0], maxEditDistance: 1).Select(r => r.Result)
101-
.ShouldBe(entries);
79+
Test(t, entries[0], 1, [(entries[1], 1), (entries[0], 0)]);
10280
}
10381

10482
[Fact]
@@ -112,7 +90,37 @@ public void StackOverflow_Scenario2()
11290

11391
var t = Levenshtrie<string>.Create(entries.Select(e => new KeyValuePair<string, string>(e, e)));
11492

115-
t.Search(entries.Last(), maxEditDistance: 1).Select(r => r.Result)
116-
.ShouldBe(entries[^2..]);
93+
Test(t, entries[^1], 1, [(entries[^2], 1), (entries[^1], 0)]);
94+
}
95+
96+
private static void Test(Levenshtrie<string> t, string query, int distance, IEnumerable<string> expected)
97+
{
98+
Test(t, query, distance, expected.Select(e => (e, LevenshteinDistance.Calculate(query, e))));
99+
}
100+
101+
private static void Test(Levenshtrie<string> t, string query, int distance, IEnumerable<(string word, int distance)> expected)
102+
{
103+
var expectedResults = expected.Select(e => new LevenshtrieSearchResult<string>(e.distance, e.word));
104+
105+
t.Search(query, distance)
106+
.ShouldBe(expectedResults, ignoreOrder: true, comparer: new LevenshtrieSearchResultComparer<string>());
107+
108+
t.EnumerateSearch(query, distance)
109+
.ShouldBe(expectedResults, ignoreOrder: true, comparer: new LevenshtrieSearchResultComparer<string>());
110+
}
111+
112+
private class LevenshtrieSearchResultComparer<T> : IEqualityComparer<LevenshtrieSearchResult<T>>
113+
{
114+
public bool Equals(LevenshtrieSearchResult<T> x, LevenshtrieSearchResult<T> y)
115+
=> x.Distance == y.Distance
116+
&& (x.Result?.Equals(y.Result) ?? (y.Result is null));
117+
118+
public int GetHashCode([DisallowNull] LevenshtrieSearchResult<T> obj)
119+
{
120+
var hashCode = new HashCode();
121+
hashCode.Add(obj.Distance);
122+
hashCode.Add(obj.Result);
123+
return hashCode.ToHashCode();
124+
}
117125
}
118126
}

0 commit comments

Comments
 (0)