Skip to content

Commit 87abdd3

Browse files
committed
Integrated some operations into main loop
1 parent 73219d8 commit 87abdd3

File tree

4 files changed

+231
-21
lines changed

4 files changed

+231
-21
lines changed

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "simple_unicode_normalization_forms"
3-
version = "0.1.0"
3+
version = "0.1.1"
44
edition = "2021"
55

66
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

src/emoji.rs

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
2+
("Emoji_Modifier", EMOJI_MODIFIER),
3+
("Emoji_Modifier_Base", EMOJI_MODIFIER_BASE),
4+
("Emoji_Presentation", EMOJI_PRESENTATION),
5+
];
6+
7+
/// Inclusive `(first, last)` ranges listed under the `"Emoji_Modifier"` name:
/// the single run U+1F3FB..=U+1F3FF.
pub const EMOJI_MODIFIER: &[(char, char)] = &[('🏻', '🏿')];
8+
/// Additional inclusive `(first, last)` ranges that `is_emoji` treats as emoji
/// even though they are not part of the named property tables:
///
/// * U+FE0E..=U+FE0F — variation selectors 15 and 16;
/// * U+20E2..=U+20E4 — combining enclosing screen / keycap / upward pointing
///   triangle.
pub const EXTRA_CHARS: &[(char, char)] =
    &[('\u{FE0E}', '\u{FE0F}'), ('\u{20E2}', '\u{20E4}')];
10+
11+
/// Inclusive `(first, last)` ranges listed under the `"Emoji_Modifier_Base"`
/// name in `BY_NAME`.
///
/// The ranges are in ascending code point order and do not overlap.
pub const EMOJI_MODIFIER_BASE: &[(char, char)] = &[
    ('☝', '☝'),
    ('⛹', '⛹'),
    ('✊', '✍'),
    ('🎅', '🎅'),
    ('🏂', '🏄'),
    ('🏇', '🏇'),
    ('🏊', '🏌'),
    ('👂', '👃'),
    ('👆', '👐'),
    ('👦', '👸'),
    ('👼', '👼'),
    ('💁', '💃'),
    ('💅', '💇'),
    ('💏', '💏'),
    ('💑', '💑'),
    ('💪', '💪'),
    ('🕴', '🕵'),
    ('🕺', '🕺'),
    ('🖐', '🖐'),
    ('🖕', '🖖'),
    ('🙅', '🙇'),
    ('🙋', '🙏'),
    ('🚣', '🚣'),
    ('🚴', '🚶'),
    ('🛀', '🛀'),
    ('🛌', '🛌'),
    ('🤌', '🤌'),
    ('🤏', '🤏'),
    ('🤘', '🤟'),
    ('🤦', '🤦'),
    ('🤰', '🤹'),
    ('🤼', '🤾'),
    ('🥷', '🥷'),
    ('🦵', '🦶'),
    ('🦸', '🦹'),
    ('🦻', '🦻'),
    ('🧍', '🧏'),
    ('🧑', '🧝'),
    ('🫃', '🫅'),
    ('🫰', '🫸'),
];
53+
54+
/// Inclusive `(first, last)` ranges listed under the `"Emoji_Presentation"`
/// name in `BY_NAME`.
///
/// The ranges are in ascending code point order and do not overlap.
pub const EMOJI_PRESENTATION: &[(char, char)] = &[
    ('⌚', '⌛'),
    ('⏩', '⏬'),
    ('⏰', '⏰'),
    ('⏳', '⏳'),
    ('◽', '◾'),
    ('☔', '☕'),
    ('♈', '♓'),
    ('♿', '♿'),
    ('⚓', '⚓'),
    ('⚡', '⚡'),
    ('⚪', '⚫'),
    ('⚽', '⚾'),
    ('⛄', '⛅'),
    ('⛎', '⛎'),
    ('⛔', '⛔'),
    ('⛪', '⛪'),
    ('⛲', '⛳'),
    ('⛵', '⛵'),
    ('⛺', '⛺'),
    ('⛽', '⛽'),
    ('✅', '✅'),
    ('✊', '✋'),
    ('✨', '✨'),
    ('❌', '❌'),
    ('❎', '❎'),
    ('❓', '❕'),
    ('❗', '❗'),
    ('➕', '➗'),
    ('➰', '➰'),
    ('➿', '➿'),
    ('⬛', '⬜'),
    ('⭐', '⭐'),
    ('⭕', '⭕'),
    ('🀄', '🀄'),
    ('🃏', '🃏'),
    ('🆎', '🆎'),
    ('🆑', '🆚'),
    ('🇦', '🇿'),
    ('🈁', '🈁'),
    ('🈚', '🈚'),
    ('🈯', '🈯'),
    ('🈲', '🈶'),
    ('🈸', '🈺'),
    ('🉐', '🉑'),
    ('🌀', '🌠'),
    ('🌭', '🌵'),
    ('🌷', '🍼'),
    ('🍾', '🎓'),
    ('🎠', '🏊'),
    ('🏏', '🏓'),
    ('🏠', '🏰'),
    ('🏴', '🏴'),
    ('🏸', '🐾'),
    ('👀', '👀'),
    ('👂', '📼'),
    ('📿', '🔽'),
    ('🕋', '🕎'),
    ('🕐', '🕧'),
    ('🕺', '🕺'),
    ('🖕', '🖖'),
    ('🖤', '🖤'),
    ('🗻', '🙏'),
    ('🚀', '🛅'),
    ('🛌', '🛌'),
    ('🛐', '🛒'),
    ('🛕', '🛗'),
    ('🛜', '🛟'),
    ('🛫', '🛬'),
    ('🛴', '🛼'),
    ('🟠', '🟫'),
    ('🟰', '🟰'),
    ('🤌', '🤺'),
    ('🤼', '🥅'),
    ('🥇', '🧿'),
    ('🩰', '🩼'),
    ('🪀', '🪈'),
    ('🪐', '🪽'),
    ('🪿', '🫅'),
    ('🫎', '🫛'),
    ('🫠', '🫨'),
    ('🫰', '🫸'),
];
137+
138+
pub trait IsEmoji {
139+
fn is_emoji(&self) -> bool;
140+
}
141+
impl IsEmoji for char {
142+
fn is_emoji(&self) -> bool {
143+
for (lc, hc) in EMOJI_PRESENTATION {
144+
if self >= lc && self <= hc {
145+
return true;
146+
}
147+
}
148+
for (lc, hc) in EMOJI_MODIFIER {
149+
if self >= lc && self <= hc {
150+
return true;
151+
}
152+
}
153+
for (lc, hc) in EMOJI_MODIFIER_BASE {
154+
if self >= lc && self <= hc {
155+
return true;
156+
}
157+
}
158+
for (lc, hc) in EXTRA_CHARS {
159+
if self >= lc && self <= hc {
160+
return true;
161+
}
162+
}
163+
false
164+
}
165+
}

src/lib.rs

Lines changed: 64 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,47 +1,78 @@
11
// Copyright (c) 2024 Future Internet Consulting and Development Solutions S.L.
2+
mod emoji;
23

4+
use emoji::IsEmoji;
35
use lazy_static::lazy_static;
4-
use regex::Regex;
56
use pyo3::prelude::*;
6-
use std::collections::HashSet;
7+
use regex::Regex;
78
use unicode_normalization::char::decompose_compatible;
89
use unicode_normalization::UnicodeNormalization;
910

1011
lazy_static! {
    /// Matches a single character that is an emoji-presentation character, an
    /// emoji modifier or modifier base, a control character (`\p{Cc}`), a
    /// variation selector (U+FE0E, U+FE0F) or a combining enclosing mark
    /// (U+20E2..U+20E4).
    ///
    /// The control-character class must be spelled `\p{Cc}`: the previous
    /// `\{Cc}` was parsed as the literal characters `{`, `C`, `c` and `}`.
    ///
    /// NOTE(review): nothing in the visible part of this file references this
    /// pattern any more now that `remove_emojis` goes through
    /// `custom_normalization` — confirm whether it can be removed.
    static ref EMOJI_RE: Regex = Regex::new(r"[\p{Emoji_Presentation}\p{Emoji_Modifier}\p{Emoji_Modifier_Base}\p{Cc}\uFE0E\uFE0F\u20E2\u20E3\u20E4]").unwrap();
}
1414

1515
/// Gives the normalized form of a string skipping some characters.
16-
fn nfkc_normalization(str: String, allow_chars: HashSet<char>) -> String {
16+
fn custom_normalization(
17+
str: String,
18+
allow_chars: Vec<char>,
19+
collapse_whitespace: bool,
20+
remove_emojis: bool,
21+
) -> String {
1722
let mut result = String::with_capacity(str.len());
23+
let mut previous_whitespace = false;
1824
for c in str.chars() {
19-
if allow_chars.contains(&c) {
20-
result.push(c)
25+
custom_character_normalization(
26+
&mut result,
27+
c,
28+
&allow_chars,
29+
collapse_whitespace,
30+
previous_whitespace,
31+
remove_emojis,
32+
);
33+
previous_whitespace = c.is_whitespace();
34+
}
35+
result.nfc().collect::<String>()
36+
}
37+
38+
fn custom_character_normalization(
39+
str: &mut String,
40+
c: char,
41+
allow_chars: &Vec<char>,
42+
collapse_whitespace: bool,
43+
previous_whitespace: bool,
44+
remove_emojis: bool,
45+
) {
46+
if allow_chars.contains(&c) {
47+
str.push(c)
48+
} else if c.is_whitespace() {
49+
if collapse_whitespace && previous_whitespace {
50+
return;
2151
} else {
22-
decompose_compatible(c, |r| {
23-
// Ignore characters outside the Basic Multilingual Plane and in the disallow_chars set
24-
if r <= '\u{FFFF}' {
25-
result.push(r)
26-
}
27-
})
52+
str.push(' ')
2853
}
54+
} else if remove_emojis && c.is_emoji() {
55+
return;
56+
} else {
57+
decompose_compatible(c, |r| {
58+
// Ignore characters outside the Basic Multilingual Plane and in the disallow_chars set
59+
if r <= '\u{FFFF}' {
60+
str.push(r)
61+
}
62+
})
2963
}
30-
31-
result.nfc().collect::<String>()
3264
}
3365

3466
#[pyfunction]
3567
fn basic_string_clean(value: String) -> PyResult<String> {
36-
Ok(nfkc_normalization(value, HashSet::from(['º', 'ª'])).trim().to_string())
68+
Ok(custom_normalization(value, vec!['º', 'ª'], false, false)
69+
.trim()
70+
.to_string())
3771
}
3872

3973
#[pyfunction]
4074
fn remove_emojis(value: String) -> PyResult<String> {
41-
let cleaned_value = nfkc_normalization(value, HashSet::from(['º', 'ª']));
42-
let whitespace_cleaned_value = WHITESPACE_RE.replace_all(&cleaned_value, " ");
43-
let result = EMOJI_RE.replace_all(&whitespace_cleaned_value, "");
44-
75+
let result = custom_normalization(value, vec!['º', 'ª'], true, true);
4576
Ok(result.trim().to_string())
4677
}
4778

@@ -52,3 +83,17 @@ fn simple_unicode_normalization_forms(m: &Bound<'_, PyModule>) -> PyResult<()> {
5283
m.add_function(wrap_pyfunction!(remove_emojis, m)?)?;
5384
Ok(())
5485
}
86+
87+
#[cfg(test)]
mod tests {
    use super::remove_emojis;
    use std::time::Instant;

    /// Runs `remove_emojis` once, prints how long the call took and checks the
    /// cleaned output.
    #[test]
    fn timeit() {
        // Variation selectors (U+FE0F) and keycap marks (U+20E3) are invisible
        // in most editors, so the input spells them out as escapes.
        let input = " a\t name with \u{2764}\u{fe0f}\u{2733}\u{fe0f}0\u{fe0f}\u{20e3}#\u{fe0f}\u{20e3} #\u{a9}*1 "
            .to_string();

        let start = Instant::now();
        let cleaned = remove_emojis(input).expect("remove_emojis should not fail");
        println!("{:?}", start.elapsed());

        // U+2764 and U+2733 are not covered by any emoji table, so they stay;
        // only the selectors and keycap marks are stripped, whitespace runs
        // collapse to one space and the ends are trimmed.
        assert_eq!(cleaned, "a name with \u{2764}\u{2733}0# #\u{a9}*1");
    }
}

0 commit comments

Comments
 (0)