Skip to content

Commit 7704b1c

Browse files
authored
Merge pull request #5 from FelipeLuz/feature/add-spanish
WIP: Add initial support to spanish language
2 parents 1c5ef98 + e6ab02e commit 7704b1c

File tree

6 files changed

+146
-20
lines changed

6 files changed

+146
-20
lines changed
Binary file not shown.
Binary file not shown.
Binary file not shown.

src/DotnetBadWordDetector/DotnetBadWordDetector.csproj

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@
22

33
<PropertyGroup>
44
<PackageId>DotnetBadWordDetector</PackageId>
5-
<Version>1.0.1</Version>
5+
<Version>2.0.0</Version>
66
<Authors>Felipe Luz</Authors>
77
<PackageDescription>Dotnet Bad Word Detector</PackageDescription>
88
<RepositoryUrl>https://github.com/FelipeLuz/dotnet-bad-word-detector</RepositoryUrl>
9-
<TargetFramework>netcoreapp3.1</TargetFramework>
10-
<LangVersion>10.0</LangVersion>
9+
<TargetFramework>net8.0</TargetFramework>
10+
<LangVersion>12.0</LangVersion>
1111
<AssemblyName>DotnetBadWordDetector</AssemblyName>
1212
<RootNamespace>DotnetBadWordDetector</RootNamespace>
1313
<ImplicitUsings>enable</ImplicitUsings>
@@ -19,5 +19,8 @@
1919

2020
<ItemGroup>
2121
<EmbeddedResource Include="Data/bad-words-model-english.zip" />
22+
<EmbeddedResource Include="Data/bad-words-model-portuguese.zip" />
23+
<EmbeddedResource Include="Data/bad-words-model-russian_cyrillic.zip" />
24+
<EmbeddedResource Include="Data/bad-words-model-spanish.zip" />
2225
</ItemGroup>
23-
</Project>
26+
</Project>
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
using System.ComponentModel;
2+
using System.Reflection;
3+
4+
namespace DotnetBadWordDetector.Model;
5+
6+
/// <summary>
/// Languages for which a trained bad-word model can be requested.
/// The lower-cased member name is substituted into the embedded model resource name.
/// </summary>
public enum Locales
{
    /// <summary>English model.</summary>
    ENGLISH = 0,

    /// <summary>Spanish model.</summary>
    SPANISH = 1,

    /// <summary>Portuguese model.</summary>
    PORTUGUESE = 2,

    /// <summary>Russian (Cyrillic script) model.</summary>
    RUSSIAN_CYRILLIC = 3,
}
13+
14+
public static class EnumExtensions
15+
{
16+
public static string GetDescription(this Enum value)
17+
{
18+
var field = value.GetType().GetField(value.ToString());
19+
var attribute = Attribute.GetCustomAttribute(field, typeof(DescriptionAttribute)) as DescriptionAttribute;
20+
return attribute == null ? value.ToString().ToLowerInvariant() : attribute.Description.ToLowerInvariant();
21+
}
22+
}
Lines changed: 117 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,54 @@
11
using Microsoft.ML;
22
using DotnetBadWordDetector.Model;
3+
using System.Text.RegularExpressions;
34

45
namespace DotnetBadWordDetector;
56
public class ProfanityDetector
67
{
8+
// Embedded-resource name template; the TAG placeholder is replaced with the lower-cased locale name.
private const string MODEL_BASE_PATH = "DotnetBadWordDetector.Data.bad-words-model-{LOCALE}.zip";

// Placeholder inside MODEL_BASE_PATH that gets substituted per locale.
private const string TAG = "{LOCALE}";

// One prediction engine per loaded locale model, in the order the locales were requested.
private readonly List<PredictionEngine<BadWord, BadWordPrediction>> _engines = new();
711

8-
private const string MODELPATH = "DotnetBadWordDetector.Data.bad-words-model-english.zip";
9-
private PredictionEngine<BadWord, BadWordPrediction> _predictionEngine;
10-
11-
public ProfanityDetector()
12+
/// <summary>
/// Creates a detector that loads either the English model only (the default) or the
/// English, Spanish and Portuguese models together.
/// Loading several locales may produce false positives on words from other languages, so use with caution.
/// </summary>
/// <param name="allLocales">
/// When true, the English, Spanish and Portuguese models are loaded (the Russian Cyrillic
/// model is not part of this set); when false, only the English model is loaded.
/// </param>
public ProfanityDetector(bool allLocales = false)
{
    if (!allLocales)
    {
        LoadTrainedModel([Locales.ENGLISH]);
        return;
    }

    LoadTrainedModel([Locales.ENGLISH, Locales.SPANISH, Locales.PORTUGUESE]);
}
24+
25+
/// <summary>
/// Creates a detector that loads the models for the given locales.
/// Loading several locales may produce false positives on words from other languages, so use with caution.
/// </summary>
/// <param name="locales">Locales whose models should be loaded, checked in the given order.</param>
/// <exception cref="ArgumentNullException"><paramref name="locales"/> is null.</exception>
public ProfanityDetector(params Locales[] locales)
{
    // Fail at the public boundary with a named argument instead of a NullReferenceException in the loader.
    if (locales is null)
    {
        throw new ArgumentNullException(nameof(locales));
    }

    LoadTrainedModel(locales);
}
1534

16-
private void LoadTrainedModel()
35+
/// <summary>
/// Loads the embedded model of every requested locale and adds one prediction engine per model
/// to <c>_engines</c>.
/// </summary>
/// <param name="locales">Locales whose models should be loaded.</param>
/// <exception cref="InvalidOperationException">A locale has no matching embedded model resource.</exception>
private void LoadTrainedModel(Locales[] locales)
{
    var mlContext = new MLContext();

    foreach (var locale in locales)
    {
        var path = MODEL_BASE_PATH.Replace(TAG, locale.GetDescription());

        // GetManifestResourceStream yields null for an unknown resource name; report which one is missing.
        // The stream is only needed while the model is being read, so it is disposed once the
        // iteration ends (NOTE(review): relies on Model.Load not keeping the stream — confirm).
        using var stream = GetModelStream(path)
            ?? throw new InvalidOperationException($"Embedded model resource '{path}' was not found.");

        var trainedModel = mlContext.Model.Load(stream, out _);
        var engine = mlContext.Model.CreatePredictionEngine<BadWord, BadWordPrediction>(trainedModel);
        _engines.Add(engine);
    }
}
2347

24-
private Stream GetModelStream()
48+
/// <summary>
/// Opens the embedded resource with the given manifest name from the assembly that
/// declares <see cref="ProfanityDetector"/>.
/// </summary>
/// <param name="path">Manifest resource name of the model archive.</param>
/// <returns>The resource stream as returned by the assembly lookup.</returns>
private Stream GetModelStream(string path) =>
    typeof(ProfanityDetector).Assembly.GetManifestResourceStream(path);
2953

3054
/// <summary>
@@ -34,8 +58,51 @@ private Stream GetModelStream()
3458
/// <returns>true if classified as profane</returns>
3559
public bool IsProfane(string word)
{
    // Drop every character that is not an ASCII letter, digit, whitespace or '@' before classifying.
    var input = new BadWord
    {
        Word = Regex.Replace(word, @"[^a-zA-Z0-9\s@]", ""),
    };

    // The word is profane as soon as any loaded locale model flags it; later models are not consulted.
    return _engines.Any(engine => engine.Predict(input).Prediction);
}
71+
72+
/// <summary>
/// Checks whether a phrase contains at least one profane word.
/// </summary>
/// <param name="phrase">Text to inspect; words may be separated by any whitespace (spaces, tabs, line breaks).</param>
/// <returns>true if any word of the phrase is classified as profane</returns>
/// <exception cref="ArgumentNullException"><paramref name="phrase"/> is null.</exception>
public bool IsPhraseProfane(string phrase)
{
    if (phrase is null)
    {
        throw new ArgumentNullException(nameof(phrase));
    }

    // A null separator array makes Split break on every white-space character, so words separated
    // by tabs or line breaks are classified individually instead of as one merged token.
    var words = phrase.Split(default(char[]), StringSplitOptions.RemoveEmptyEntries);

    foreach (var word in words)
    {
        if (IsProfane(word))
        {
            return true;
        }
    }

    return false;
}
88+
89+
/// <summary>
/// Gets the probability that a phrase contains a profane word, taken as the highest
/// probability found among its individual words.
/// </summary>
/// <param name="phrase">Text to inspect; words may be separated by any whitespace (spaces, tabs, line breaks).</param>
/// <returns>The highest per-word probability, or 0 when the phrase contains no words</returns>
/// <exception cref="ArgumentNullException"><paramref name="phrase"/> is null.</exception>
public float GetPhraseProfanityProbability(string phrase)
{
    if (phrase is null)
    {
        throw new ArgumentNullException(nameof(phrase));
    }

    // A null separator array makes Split break on every white-space character, so words separated
    // by tabs or line breaks are scored individually instead of as one merged token.
    var words = phrase.Split(default(char[]), StringSplitOptions.RemoveEmptyEntries);
    var biggestProb = 0f;

    foreach (var word in words)
    {
        var currProb = GetProfanityProbability(word);
        biggestProb = Math.Max(currProb, biggestProb);
    }

    return biggestProb;
}
40107

41108
/// <summary>
@@ -45,7 +112,41 @@ public bool IsProfane(string word)
45112
/// <returns> 0 &lt; prediction &lt; 1</returns>
46113
public float GetProfanityProbability(string word)
{
    // Drop every character that is not an ASCII letter, digit, whitespace or '@' before scoring.
    var sanitized = Regex.Replace(word, @"[^a-zA-Z0-9\s@]", "");
    var input = new BadWord { Word = sanitized };

    // Fold over all loaded locale models, keeping the highest probability; the seed of 0 is
    // what comes back when no model is loaded.
    return _engines.Aggregate(
        0f,
        (highest, engine) => Math.Max(engine.Predict(input).Probability, highest));
}
127+
128+
/// <summary>
/// Masks the profane words in a phrase with a given character.
/// Each whitespace-delimited word that is classified as profane is replaced by a run of
/// <paramref name="maskChar"/> of the same length; all other text, including the original
/// whitespace between words, is returned unchanged.
/// </summary>
/// <param name="phrase">Text to censor.</param>
/// <param name="maskChar">Character used to overwrite profane words.</param>
/// <returns>Phrase with masked bad words</returns>
/// <exception cref="ArgumentNullException"><paramref name="phrase"/> is null.</exception>
public string MaskProfanity(string phrase, char maskChar = '*')
{
    if (phrase is null)
    {
        throw new ArgumentNullException(nameof(phrase));
    }

    // Replace tokens in place rather than split/join: splitting on ' ' and re-joining collapsed
    // repeated spaces, dropped leading/trailing whitespace and treated tab- or newline-separated
    // words as a single token.
    return Regex.Replace(
        phrase,
        @"\S+",
        match => IsProfane(match.Value) ? new string(maskChar, match.Value.Length) : match.Value);
}
51152
}

0 commit comments

Comments
 (0)