
Commit 7f76c64

Merge pull request #17 from dataiku/bug/remove-spacymoji
Remove spacymoji and avoid loading stopwords if not required
2 parents: 8d93fd7 + 7ae672f

File tree: 4 files changed, +11 −10 lines


code-env/python/spec/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ jieba==0.42.1
 pyvi==0.1
 regex==2020.11.13
 spacy[lookups,th]==2.3.5
-spacymoji==2.0.0
+emoji==1.2.0
 tqdm==4.50.2
 matplotlib==3.3.1
 wordcloud==1.8.0
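Note: the spacymoji pipeline dependency is dropped from the code env and the emoji package is pinned directly instead. spacy_tokenizer.py already imports UNICODE_EMOJI from emoji (an unchanged context line in the diff further down), which previously was likely only available as a transitive dependency of spacymoji.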

custom-recipes/nlp-visualization-wordcloud/recipe.py

Lines changed: 5 additions & 1 deletion
@@ -17,9 +17,13 @@
 output_partition_path = params["output_partition_path"]
 df = params["df"]

+# Instanciate tokenizer
+tokenizer = MultilingualTokenizer(
+    stopwords_folder_path=(params["stopwords_folder_path"] if params["remove_stopwords"] else None)
+)
 # Load wordcloud visualizer
 worcloud_visualizer = WordcloudVisualizer(
-    tokenizer=MultilingualTokenizer(stopwords_folder_path=params["stopwords_folder_path"]),
+    tokenizer=tokenizer,
     text_column=params["text_column"],
     font_folder_path=font_folder_path,
     language=params["language"],
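With this change the recipe only passes stopwords_folder_path to the tokenizer when the remove_stopwords option is enabled; otherwise it passes None, so stopword files are not loaded for runs that keep stopwords, matching the second half of the commit message.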

python-lib/spacy_tokenizer.py

Lines changed: 0 additions & 5 deletions
@@ -14,7 +14,6 @@
 from spacy.language import Language
 from spacy.tokens import Doc, Token
 from spacy.vocab import Vocab
-from spacymoji import Emoji
 from emoji import UNICODE_EMOJI
 from fastcore.utils import store_attr

@@ -176,10 +175,6 @@ def _create_spacy_tokenizer(self, language: AnyStr) -> Language:
         nlp.tokenizer.prefix_search = spacy.util.compile_prefix_regex(_prefixes).search
         if self.stopwords_folder_path and language in SUPPORTED_LANGUAGES_SPACY:
             self._customize_stopwords(nlp, language)
-        try:
-            nlp.add_pipe(Emoji(nlp), first=True)
-        except (AttributeError, ValueError) as e:
-            logging.warning(f"Spacymoji not available for language '{language}' because of error: '{e}'")
         logging.info(f"Loading tokenizer for language '{language}': done in {perf_counter() - start:.2f} seconds")
         return nlp
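With the Emoji pipe removed, emoji handling presumably falls back on the UNICODE_EMOJI table from the emoji package, which this module already imports. A minimal, hypothetical sketch of token-level emoji detection along those lines (not the plugin's actual implementation; the EMOJI_CHARS helper, the "en" language key, and the spacy.blank("en") pipeline are illustrative assumptions):

# Hypothetical sketch: detect emoji tokens with the emoji package instead of
# the spacymoji pipeline component.
import spacy
from emoji import UNICODE_EMOJI

# emoji 1.x keys UNICODE_EMOJI by language code; older releases expose a flat
# {emoji: name} mapping, so fall back to that if "en" is not a key.
EMOJI_CHARS = set(UNICODE_EMOJI["en"]) if "en" in UNICODE_EMOJI else set(UNICODE_EMOJI)

nlp = spacy.blank("en")  # bare tokenizer, no extra pipeline components required
doc = nlp("nice work 👍")
emoji_tokens = [token.text for token in doc if token.text in EMOJI_CHARS]
print(emoji_tokens)  # e.g. ['👍'] when the emoji ends up in its own token

Compared with the spacymoji pipe, a lookup like this keeps the dependency surface smaller and avoids the per-language add_pipe failure path that the removed try/except had to guard against.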

tests/python/integration/test_wordcloud.py

Lines changed: 5 additions & 3 deletions
@@ -22,21 +22,23 @@ def test_wordcloud_multilingual_subcharts(user_dss_clients):


 def test_wordcloud_multilingual_subcharts_unsupported_languages(user_dss_clients):
-    dss_scenario.run(user_dss_clients, project_key=TEST_PROJECT_KEY, scenario_id="multilingual_subcharts_unsupported_languages")
+    dss_scenario.run(
+        user_dss_clients, project_key=TEST_PROJECT_KEY, scenario_id="multilingual_subcharts_unsupported_languages"
+    )


 def test_wordcloud_multilingual_subcharts_per_language(user_dss_clients):
     dss_scenario.run(user_dss_clients, project_key=TEST_PROJECT_KEY, scenario_id="subchart_per_language")


-def test_wordcloud_edge_cases_multilingual(user_clients):
+def test_wordcloud_edge_cases_multilingual(user_dss_clients):
     dss_scenario.run(user_dss_clients, project_key=TEST_PROJECT_KEY, scenario_id="EDGE_CASES")


 def test_wordcloud_partitioned_folder_file(user_dss_clients):
     dss_scenario.run(user_dss_clients, project_key=TEST_PROJECT_KEY, scenario_id="partitionned_folder_file")

-
+
 def test_wordcloud_partitioned_folder_sql(user_dss_clients):
     dss_scenario.run(user_dss_clients, project_key=TEST_PROJECT_KEY, scenario_id="partitionned_folder_sql")

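Besides the line-length reformatting and a whitespace-only fix, this hunk corrects a fixture name: test_wordcloud_edge_cases_multilingual previously took an undefined user_clients argument and now takes user_dss_clients like the other tests.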