FelipeLuz
diff --git a/‎src/DotnetBadWordDetector/Data/bad-words-model-portuguese.zip
86.7 KB b/‎src/DotnetBadWordDetector/Data/bad-words-model-portuguese.zip
86.7 KB
diff --git a/‎src/DotnetBadWordDetector/Data/bad-words-model-russian_cyrillic.zip
44.4 KB b/‎src/DotnetBadWordDetector/Data/bad-words-model-russian_cyrillic.zip
44.4 KB
diff --git a/‎src/DotnetBadWordDetector/Data/bad-words-model-spanish.zip
74 KB b/‎src/DotnetBadWordDetector/Data/bad-words-model-spanish.zip
74 KB
diff --git a/‎src/DotnetBadWordDetector/DotnetBadWordDetector.csproj
Lines changed: 7 additions & 4 deletions b/‎src/DotnetBadWordDetector/DotnetBadWordDetector.csproj
Lines changed: 7 additions & 4 deletions
diff --git a/‎src/DotnetBadWordDetector/Model/Locales.cs
Lines changed: 22 additions & 0 deletions b/‎src/DotnetBadWordDetector/Model/Locales.cs
Lines changed: 22 additions & 0 deletions
diff --git a/‎src/DotnetBadWordDetector/ProfanityDetector.cs
Lines changed: 117 additions & 16 deletions b/‎src/DotnetBadWordDetector/ProfanityDetector.cs
Lines changed: 117 additions & 16 deletions
@@ -2,12 +2,12 @@
 
     <PropertyGroup>
         <PackageId>DotnetBadWordDetector</PackageId>
-        <Version>1.0.1</Version>
+        <Version>2.0.0</Version>
         <Authors>Felipe Luz</Authors>
         <PackageDescription>Dotnet Bad Word Detector</PackageDescription>
         <RepositoryUrl>https://github.com/FelipeLuz/dotnet-bad-word-detector</RepositoryUrl>
-        <TargetFramework>netcoreapp3.1</TargetFramework>
-        <LangVersion>10.0</LangVersion>
+        <TargetFramework>net8.0</TargetFramework>
+        <LangVersion>12.0</LangVersion>
         <AssemblyName>DotnetBadWordDetector</AssemblyName>
         <RootNamespace>DotnetBadWordDetector</RootNamespace>
         <ImplicitUsings>enable</ImplicitUsings>
@@ -19,5 +19,8 @@
 
     <ItemGroup>
         <EmbeddedResource Include="Data/bad-words-model-english.zip" />
+        <EmbeddedResource Include="Data/bad-words-model-portuguese.zip" />
+        <EmbeddedResource Include="Data/bad-words-model-russian_cyrillic.zip" />
+        <EmbeddedResource Include="Data/bad-words-model-spanish.zip" />
     </ItemGroup>
-</Project>
+</Project>
@@ -0,0 +1,22 @@
+using System.ComponentModel;
+using System.Reflection;
+
+namespace DotnetBadWordDetector.Model;
+
+/// <summary>
+/// Languages for which a trained bad-word model is embedded in the assembly.
+/// </summary>
+/// <remarks>
+/// The lower-cased member name is substituted into the embedded model resource name,
+/// so every member needs a matching <c>bad-words-model-{name}.zip</c> resource.
+/// </remarks>
+public enum Locales
+{
+    /// <summary>English model.</summary>
+    ENGLISH = 0,
+
+    /// <summary>Spanish model.</summary>
+    SPANISH = 1,
+
+    /// <summary>Portuguese model.</summary>
+    PORTUGUESE = 2,
+
+    /// <summary>Russian model for text written in the Cyrillic script.</summary>
+    RUSSIAN_CYRILLIC = 3
+}
+
+/// <summary>
+/// Helpers for turning enum values into lower-cased text.
+/// </summary>
+public static class EnumExtensions
+{
+    /// <summary>
+    /// Returns the lower-cased <see cref="DescriptionAttribute"/> text of an enum value,
+    /// or the lower-cased result of <c>ToString()</c> when no description is declared.
+    /// </summary>
+    /// <param name="value">Enum value to describe.</param>
+    /// <returns>The description or the value's textual form, lower-cased with the invariant culture.</returns>
+    public static string GetDescription(this Enum value)
+    {
+        var name = value.ToString();
+
+        // GetField returns null when the text is not a declared member (undefined numeric
+        // values, combined flags). Looking up an attribute on a null field throws, so such
+        // values fall through to the plain textual form instead.
+        var field = value.GetType().GetField(name);
+        if (field is not null
+            && Attribute.GetCustomAttribute(field, typeof(DescriptionAttribute)) is DescriptionAttribute attribute)
+        {
+            return attribute.Description.ToLowerInvariant();
+        }
+
+        return name.ToLowerInvariant();
+    }
+}
@@ -1,30 +1,54 @@
 using Microsoft.ML;
 using DotnetBadWordDetector.Model;
+using System.Text.RegularExpressions;
 
 namespace DotnetBadWordDetector;
 public class ProfanityDetector
 {
+    // Embedded-resource name template; TAG is replaced with the lower-cased locale name.
+    private const string MODEL_BASE_PATH = "DotnetBadWordDetector.Data.bad-words-model-{LOCALE}.zip";
+
+    // Placeholder inside MODEL_BASE_PATH that is swapped for the locale.
+    private const string TAG = "{LOCALE}";
+
+    // One prediction engine per loaded locale, in the order the locales were requested.
+    private readonly List<PredictionEngine<BadWord, BadWordPrediction>> _engines = new();
 
-    private const string MODELPATH = "DotnetBadWordDetector.Data.bad-words-model-english.zip";
-    private PredictionEngine<BadWord, BadWordPrediction> _predictionEngine;
-    
-    public ProfanityDetector()
+    /// <summary>
+    /// Creates a detector that loads either the English model only (the default) or the
+    /// English, Spanish and Portuguese models together.
+    /// Loading several locales may cause false positives on text written in other
+    /// languages, so enable it with caution.
+    /// </summary>
+    /// <param name="allLocales">
+    /// <c>true</c> to load the English, Spanish and Portuguese models; <c>false</c> to load English only.
+    /// </param>
+    public ProfanityDetector(bool allLocales = false)
+    {
+        Locales[] selected;
+        if (allLocales)
+        {
+            selected = [Locales.ENGLISH, Locales.SPANISH, Locales.PORTUGUESE];
+        }
+        else
+        {
+            selected = [Locales.ENGLISH];
+        }
+
+        LoadTrainedModel(selected);
+    }
+
+    /// <summary>
+    /// Constructor to initialize the profanity detector with specific locales.
+    /// Consider that different locales may cause false-positives on other languages, so use with caution.
+    /// </summary>
+    /// <param name="locales">Locales whose trained models are loaded; one prediction engine is created per entry.</param>
+    public ProfanityDetector(params Locales[] locales)
     {
-        LoadTrainedModel();
+        LoadTrainedModel(locales);
     }
 
+    /// <summary>
+    /// Loads the embedded trained model of every requested locale and adds one prediction
+    /// engine per model to the engine list.
+    /// </summary>
+    /// <param name="locales">Locales whose models should be loaded.</param>
+    /// <exception cref="ArgumentNullException"><paramref name="locales"/> is null.</exception>
-    private void LoadTrainedModel()
+    private void LoadTrainedModel(Locales[] locales)
     {   
-        DataViewSchema modelSchema;
+        ArgumentNullException.ThrowIfNull(locales);
+
         var mlContext = new MLContext();
-        var trainedModel = mlContext.Model.Load(GetModelStream(), out modelSchema);
-        _predictionEngine = mlContext.Model.CreatePredictionEngine<BadWord, BadWordPrediction>(trainedModel);
+        foreach (var locale in locales)
+        {
+            // The locale's lower-cased name selects the embedded resource.
+            var path = MODEL_BASE_PATH.Replace(TAG, locale.GetDescription());
+
+            // The stream is only needed while the model is being read, so it is disposed
+            // at the end of each iteration instead of being left open.
+            using var stream = GetModelStream(path);
+            var trainedModel = mlContext.Model.Load(stream, out _);
+            _engines.Add(mlContext.Model.CreatePredictionEngine<BadWord, BadWordPrediction>(trainedModel));
+        }
     }
 
+    /// <summary>
+    /// Opens the embedded model resource with the given manifest name.
+    /// </summary>
+    /// <param name="path">Manifest resource name of the model archive.</param>
+    /// <returns>A readable stream over the embedded resource.</returns>
+    /// <exception cref="InvalidOperationException">No embedded resource has that name.</exception>
-    private Stream GetModelStream()
+    private Stream GetModelStream(string path)
     {
-        var assembly = typeof(DotnetBadWordDetector.ProfanityDetector).Assembly;
-        return assembly.GetManifestResourceStream(MODELPATH);
+        var assembly = typeof(ProfanityDetector).Assembly;
+
+        // GetManifestResourceStream returns null when the resource is missing; fail here
+        // with the resource name rather than handing null to the model loader.
+        return assembly.GetManifestResourceStream(path)
+            ?? throw new InvalidOperationException($"Embedded model resource '{path}' was not found.");
     }
 
     /// <summary>
@@ -34,8 +58,51 @@ private Stream GetModelStream()
     /// <returns>true if classified as profane</returns>
     public bool IsProfane(string word)
     {
-        var obj = new BadWord { Word = word };
-        return _predictionEngine.Predict(obj).Prediction;
+        // Strip punctuation and symbols but keep ASCII letters and digits, whitespace, '@'
+        // and Cyrillic letters. An ASCII-only filter erased Cyrillic input completely,
+        // leaving the Russian (Cyrillic) model with an empty string to classify.
+        var cleanWord = Regex.Replace(word, @"[^a-zA-Z0-9\s@\p{IsCyrillic}]", "");
+        var obj = new BadWord { Word = cleanWord };
+
+        // The word is profane as soon as any loaded locale model flags it.
+        foreach (var engine in _engines)
+        {
+            if (engine.Predict(obj).Prediction)
+            {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /// <summary>
+    /// Tells whether at least one space-separated word of a phrase is classified as profane
+    /// </summary>
+    /// <param name="phrase">Text to inspect; it is split on spaces and empty pieces are ignored</param>
+    /// <returns>true if any word is classified as profane, otherwise false</returns>
+    public bool IsPhraseProfane(string phrase)
+    {
+        var tokens = phrase.Split(' ', StringSplitOptions.RemoveEmptyEntries);
+
+        // Array.Exists stops at the first word that is flagged.
+        return Array.Exists(tokens, IsProfane);
+    }
+
+    /// <summary>
+    /// Gets the highest profanity probability found among the space-separated words of a phrase
+    /// </summary>
+    /// <param name="phrase">Text to score; it is split on spaces and empty pieces are ignored</param>
+    /// <returns>The largest per-word probability, or 0 when the phrase contains no words</returns>
+    public float GetPhraseProfanityProbability(string phrase)
+    {
+        var highest = 0f;
+
+        foreach (var token in phrase.Split(' ', StringSplitOptions.RemoveEmptyEntries))
+        {
+            highest = Math.Max(highest, GetProfanityProbability(token));
+        }
+
+        return highest;
     }
 
     /// <summary>
@@ -45,7 +112,41 @@ public bool IsProfane(string word)
     /// <returns> 0 < prediction < 1</returns>
     public float GetProfanityProbability(string word)
     {
-        var obj = new BadWord { Word = word };
-        return _predictionEngine.Predict(obj).Probability;
+        // Strip punctuation and symbols but keep ASCII letters and digits, whitespace, '@'
+        // and Cyrillic letters. An ASCII-only filter erased Cyrillic input completely,
+        // leaving the Russian (Cyrillic) model with an empty string to score.
+        var cleanWord = Regex.Replace(word, @"[^a-zA-Z0-9\s@\p{IsCyrillic}]", "");
+        var obj = new BadWord { Word = cleanWord };
+        var biggestProb = 0f;
+
+        // Report the highest probability produced by any loaded locale model.
+        foreach (var engine in _engines)
+        {
+            biggestProb = Math.Max(biggestProb, engine.Predict(obj).Probability);
+        }
+
+        return biggestProb;
+    }
+
+    /// <summary>
+    /// Masks the profane words in a phrase with a given character
+    /// </summary>
+    /// <param name="phrase">Text to censor; words are delimited by spaces</param>
+    /// <param name="maskChar">Character repeated once for every character of a profane word</param>
+    /// <returns>Phrase with masked bad words; the spacing of the input is kept as is</returns>
+    public string MaskProfanity(string phrase, char maskChar = '*')
+    {
+        // Empty entries are kept on purpose: joining the pieces back with a single space
+        // then reproduces repeated, leading and trailing spaces exactly, so text that is
+        // not masked comes back unchanged.
+        var tokens = phrase.Split(' ');
+
+        for (var i = 0; i < tokens.Length; i++)
+        {
+            var token = tokens[i];
+
+            // Empty pieces only mark extra spaces; they are never classified.
+            if (token.Length > 0 && IsProfane(token))
+            {
+                tokens[i] = new string(maskChar, token.Length);
+            }
+        }
+
+        return string.Join(' ', tokens);
     }
 }