11using Microsoft . ML ;
22using DotnetBadWordDetector . Model ;
3+ using System . Text . RegularExpressions ;
34
45namespace DotnetBadWordDetector ;
56public class ProfanityDetector
67{
// Manifest-resource name template for the bundled models; TAG is replaced with the locale description.
private const string MODEL_BASE_PATH = "DotnetBadWordDetector.Data.bad-words-model-{LOCALE}.zip";

// Placeholder inside MODEL_BASE_PATH that is substituted per locale.
private const string TAG = "{LOCALE}";

// One prediction engine per loaded locale; populated by LoadTrainedModel and never reassigned.
private readonly List<PredictionEngine<BadWord, BadWordPrediction>> _engines = new();
711
8- private const string MODELPATH = "DotnetBadWordDetector.Data.bad-words-model-english.zip" ;
9- private PredictionEngine < BadWord , BadWordPrediction > _predictionEngine ;
10-
11- public ProfanityDetector ( )
/// <summary>
/// Creates a detector that loads either the English model only (the default) or the
/// English, Spanish and Portuguese models together.
/// Loading several locales may cause false positives on text written in other languages, so use with caution.
/// </summary>
/// <param name="allLocales">When true, the English, Spanish and Portuguese models are loaded; otherwise only English.</param>
public ProfanityDetector(bool allLocales = false)
{
    Locales[] selected;
    if (allLocales)
    {
        selected = [Locales.ENGLISH, Locales.SPANISH, Locales.PORTUGUESE];
    }
    else
    {
        selected = [Locales.ENGLISH];
    }

    LoadTrainedModel(selected);
}
24+
/// <summary>
/// Creates a detector that loads the models for the given locales.
/// Loading several locales may cause false positives on text written in other languages, so use with caution.
/// </summary>
/// <param name="locales">Locales whose models should be loaded.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="locales"/> is null.</exception>
public ProfanityDetector(params Locales[] locales)
{
    // Fail at the public boundary instead of with a NullReferenceException inside the loader.
    ArgumentNullException.ThrowIfNull(locales);

    LoadTrainedModel(locales);
}
1534
/// <summary>
/// Loads the embedded model of every requested locale and adds one prediction engine
/// per locale to <c>_engines</c>.
/// </summary>
/// <param name="locales">Locales whose models are loaded, in the given order.</param>
private void LoadTrainedModel(Locales[] locales)
{
    var mlContext = new MLContext();
    foreach (var locale in locales)
    {
        var path = MODEL_BASE_PATH.Replace(TAG, locale.GetDescription());

        // Release the resource stream once this locale's engine has been built;
        // previously the stream was never disposed.
        using var stream = GetModelStream(path);
        var trainedModel = mlContext.Model.Load(stream, out _);
        var engine = mlContext.Model.CreatePredictionEngine<BadWord, BadWordPrediction>(trainedModel);
        _engines.Add(engine);
    }
}
2347
/// <summary>
/// Opens the embedded resource with the given manifest name from the assembly that
/// declares <see cref="ProfanityDetector"/>.
/// </summary>
/// <param name="path">Manifest resource name of the model archive.</param>
/// <returns>A stream over the embedded resource.</returns>
/// <exception cref="FileNotFoundException">Thrown when no resource with that name is embedded in the assembly.</exception>
private Stream GetModelStream(string path)
{
    var assembly = typeof(ProfanityDetector).Assembly;

    // GetManifestResourceStream returns null for an unknown name; surface that here with a
    // clear message instead of letting a null stream fail later inside the model loader.
    return assembly.GetManifestResourceStream(path)
        ?? throw new FileNotFoundException($"Embedded model resource '{path}' was not found.", path);
}
2953
3054 /// <summary>
@@ -34,8 +58,51 @@ private Stream GetModelStream()
3458 /// <returns>true if classified as profane</returns>
public bool IsProfane(string word)
{
    // A null word is treated as not profane; without this guard Regex.Replace throws.
    if (word is null)
    {
        return false;
    }

    // Keep only ASCII letters, digits, whitespace and '@' before classification.
    // NOTE(review): this also strips accented letters (e.g. in Spanish/Portuguese words) —
    // confirm that this matches how the models were trained.
    var cleanWord = Regex.Replace(word, @"[^a-zA-Z0-9\s@]", "");
    var obj = new BadWord { Word = cleanWord };

    // The word is profane as soon as any loaded locale model flags it.
    foreach (var engine in _engines)
    {
        if (engine.Predict(obj).Prediction)
        {
            return true;
        }
    }

    return false;
}
71+
/// <summary>
/// Checks whether a phrase contains at least one profane word.
/// </summary>
/// <param name="phrase">Text to inspect; words are separated by any whitespace.</param>
/// <returns>true if any word is classified as profane; false otherwise, including for a null or empty phrase.</returns>
public bool IsPhraseProfane(string phrase)
{
    if (string.IsNullOrEmpty(phrase))
    {
        return false;
    }

    // Tokenize on any whitespace (spaces, tabs, line breaks), not just ' ', so that
    // words separated by tabs or newlines are classified individually.
    foreach (Match token in Regex.Matches(phrase, @"\S+"))
    {
        if (IsProfane(token.Value))
        {
            return true;
        }
    }

    return false;
}
88+
/// <summary>
/// Gets the probability that a phrase contains profane words, computed as the highest
/// probability among its individual words.
/// </summary>
/// <param name="phrase">Text to inspect; words are separated by any whitespace.</param>
/// <returns>The highest per-word probability, or 0 when the phrase is null or contains no words.</returns>
public float GetPhraseProfanityProbability(string phrase)
{
    var biggestProb = 0f;
    if (string.IsNullOrEmpty(phrase))
    {
        return biggestProb;
    }

    // Tokenize on any whitespace (spaces, tabs, line breaks), not just ' '.
    foreach (Match token in Regex.Matches(phrase, @"\S+"))
    {
        var currProb = GetProfanityProbability(token.Value);
        biggestProb = Math.Max(currProb, biggestProb);
    }

    return biggestProb;
}
40107
41108 /// <summary>
@@ -45,7 +112,41 @@ public bool IsProfane(string word)
45112 /// <returns> 0 < prediction < 1</returns>
public float GetProfanityProbability(string word)
{
    var biggestProb = 0f;

    // A null word gets probability 0; without this guard Regex.Replace throws.
    if (word is null)
    {
        return biggestProb;
    }

    // Keep only ASCII letters, digits, whitespace and '@' before classification.
    // NOTE(review): this also strips accented letters (e.g. in Spanish/Portuguese words) —
    // confirm that this matches how the models were trained.
    var cleanWord = Regex.Replace(word, @"[^a-zA-Z0-9\s@]", "");
    var obj = new BadWord { Word = cleanWord };

    // Report the highest probability across all loaded locale models.
    foreach (var engine in _engines)
    {
        var currProb = engine.Predict(obj).Probability;
        biggestProb = Math.Max(currProb, biggestProb);
    }

    return biggestProb;
}
127+
/// <summary>
/// Masks the profane words in a phrase with a given character.
/// Each profane word is replaced by <paramref name="maskChar"/> repeated to the word's length;
/// all other text, including the original whitespace, is left untouched.
/// </summary>
/// <param name="phrase">Text to censor; words are separated by any whitespace.</param>
/// <param name="maskChar">Character used to overwrite profane words.</param>
/// <returns>Phrase with masked bad words, or an empty string when the phrase is null or empty.</returns>
public string MaskProfanity(string phrase, char maskChar = '*')
{
    if (string.IsNullOrEmpty(phrase))
    {
        return string.Empty;
    }

    // Replace in place, token by token, so the spacing between words is preserved
    // (splitting and re-joining on ' ' trimmed the phrase and collapsed repeated spaces).
    return Regex.Replace(
        phrase,
        @"\S+",
        token => IsProfane(token.Value) ? new string(maskChar, token.Length) : token.Value);
}
51152}
0 commit comments