
Commit 364d4f5

Merge pull request #9 from LlmKira/dev-3
✨ feat(app): add MD5 verification for FastText model integrity
2 parents 2de37bb + f08eeaa commit 364d4f5

File tree

4 files changed: +74 −28 lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "fast-langdetect"
-version = "0.2.3"
+version = "0.2.4"
 description = "Quickly detect text language and segment language"
 authors = [
     { name = "sudoskys", email = "coldlando@hotmail.com" },

src/fast_langdetect/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 # -*- coding: utf-8 -*-


-from .ft_detect import detect, detect_language, detect_langs, detect_multilingual  # noqa: F401
+from .ft_detect import detect, detect_language, detect_multilingual  # noqa: F401

src/fast_langdetect/ft_detect/__init__.py

Lines changed: 0 additions & 12 deletions
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2024/1/17 下午4:00
-import logging

 from .infer import detect
 from .infer import detect_multilingual  # noqa: F401
@@ -24,14 +23,3 @@ def detect_language(sentence, *, low_memory: bool = True):
     if lang_code == "JA" and not is_japanese(sentence):
         lang_code = "ZH"
     return lang_code
-
-
-def detect_langs(sentence, *, low_memory: bool = True):
-    """
-    Detect language
-    :param sentence: str sentence
-    :param low_memory: bool (default: True) whether to use low memory mode
-    :return: ZH, EN, JA, KO, FR, DE, ES, .... (two uppercase letters)
-    """
-    logging.warning("detect_langs is deprecated, use detect_language instead")
-    return detect_language(sentence, low_memory=low_memory)
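The deprecated detect_langs wrapper is removed outright here, so downstream code must call detect_language directly. A minimal caller-side migration sketch (not part of this commit, assuming fast-langdetect 0.2.4 is installed):

# Hypothetical migration sketch: detect_langs() no longer exists after this commit.
from fast_langdetect import detect_language

# Before (0.2.3): lang_code = detect_langs("Hello world", low_memory=True)
# After  (0.2.4):
lang_code = detect_language("Hello world", low_memory=True)
print(lang_code)  # e.g. "EN" (two uppercase letters)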

src/fast_langdetect/ft_detect/infer.py

Lines changed: 72 additions & 14 deletions
@@ -17,6 +17,7 @@

 FASTTEXT_LARGE_MODEL_URL = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
 FASTTEXT_LARGE_MODEL_NAME = "lid.176.bin"
+VERIFY_FASTTEXT_LARGE_MODEL = "01810bc59c6a3d2b79c79e6336612f65"


 class DetectError(Exception):
@@ -38,6 +39,36 @@ def cache_model(self, key: str, model) -> None:

 _model_cache = ModelManager()

+import hashlib
+
+
+def calculate_md5(file_path, chunk_size=8192):
+    """
+    Calculate the MD5 hash of a file.
+
+    :param file_path: Path to the file
+    :param chunk_size: Size of each chunk to read from the file
+    :return: MD5 hash of the file
+    """
+    md5 = hashlib.md5()
+    with open(file_path, 'rb') as f:
+        for chunk in iter(lambda: f.read(chunk_size), b''):
+            md5.update(chunk)
+    return md5.hexdigest()
+
+
+def verify_md5(file_path, expected_md5, chunk_size=8192):
+    """
+    Verify the MD5 hash of a file against an expected hash.
+
+    :param file_path: Path to the file
+    :param expected_md5: Expected MD5 hash
+    :param chunk_size: Size of each chunk to read from the file
+    :return: True if the file's MD5 hash matches the expected hash, False otherwise
+    """
+    md5 = calculate_md5(file_path, chunk_size)
+    return md5 == expected_md5
+

 def download_model(
     download_url: str,
@@ -62,8 +93,9 @@ def download_model(
             folder=str(save_path.parent),
             filename=save_path.name,
             proxy=proxy,
-            retry_max=3,
-            timeout=30,
+            retry_max=2,
+            sleep_max=5,
+            timeout=7,
         )
     except Exception as e:
         logger.error(f"fast-langdetect:Failed to download FastText model from {download_url}: {e}")
@@ -83,18 +115,29 @@ def load_fasttext_model(
     :return: FastText model
     :raises DetectError: If model loading fails
     """
-    if not model_path.exists() and download_url:
-        # Attempt to download the model
-        download_model(download_url, model_path, proxy)
-
+    if all([
+        VERIFY_FASTTEXT_LARGE_MODEL,
+        model_path.exists(),
+        model_path.name == FASTTEXT_LARGE_MODEL_NAME,
+    ]):
+        if not verify_md5(model_path, VERIFY_FASTTEXT_LARGE_MODEL):
+            logger.warning(
+                f"fast-langdetect: MD5 hash verification failed for {model_path}, "
+                f"please check the integrity of the downloaded file from {FASTTEXT_LARGE_MODEL_URL}. "
+                "\n This may seriously reduce the prediction accuracy. "
+                "If you want to ignore this, please set `fast_langdetect.ft_detect.infer.VERIFY_FASTTEXT_LARGE_MODEL = None` "
+            )
     if not model_path.exists():
-        raise DetectError(f"FastText model file not found at {model_path}")
+        if download_url:
+            download_model(download_url, model_path, proxy)
+        if not model_path.exists():
+            raise DetectError(f"FastText model file not found at {model_path}")

     try:
         # Load FastText model
         return fasttext.load_model(str(model_path))
     except Exception as e:
-        logger.error(f"fast-langdetect:Failed to load FastText model from {model_path}: {e}")
+        logger.warning(f"fast-langdetect:Failed to load FastText model from {model_path}: {e}")
         raise DetectError(f"Failed to load FastText model: {e}")
@@ -131,7 +174,7 @@ def load_model(
         _model_cache.cache_model(cache_key, model)
         return model
     except Exception as e:
-        logger.error(f"fast-langdetect:Failed to load model ({'low' if low_memory else 'high'} memory): {e}")
+        logger.warning(f"fast-langdetect:Failed to load model ({'low' if low_memory else 'high'} memory): {e}")
         if use_strict_mode:
             raise DetectError("Failed to load FastText model.") from e
         elif not low_memory:
@@ -149,12 +192,15 @@ def detect(
 ) -> Dict[str, Union[str, float]]:
     """
     Detect the language of a text using FastText.
-    This function assumes to be given a single line of text. We split words on whitespace (space, newline, tab, vertical tab) and the control characters carriage return, formfeed and the null character.
-    If the model is not supervised, this function will throw a ValueError.
+
+    - You MUST manually remove line breaks (`\n`) from the text to be processed in advance, otherwise a ValueError is raised.
+
+    - In scenarios **where accuracy is important**, you should not rely on the detection results of small models, use `low_memory=False` to download larger models!
+
     :param text: The text for language detection
-    :param low_memory: Whether to use a memory-efficient model
+    :param low_memory: Whether to use the compressed version of the model (https://fasttext.cc/docs/en/language-identification.html)
     :param model_download_proxy: Download proxy for the model if needed
-    :param use_strict_mode: If it was enabled, strictly loads large model or raises error if it fails
+    :param use_strict_mode: When this parameter is enabled, the fallback after loading failure will be disabled.
     :return: A dictionary with detected language and confidence score
     :raises LanguageDetectionError: If detection fails
     """
@@ -176,14 +222,26 @@ def detect(
 def detect_multilingual(
     text: str,
     *,
-    low_memory: bool = True,
+    low_memory: bool = False,
     model_download_proxy: Optional[str] = None,
     k: int = 5,
     threshold: float = 0.0,
     use_strict_mode: bool = False,
 ) -> List[Dict[str, Any]]:
     """
     Detect the top-k probable languages for a given text.
+
+    - You MUST manually remove line breaks (`\n`) from the text to be processed in advance, otherwise a ValueError is raised.
+
+    - In scenarios **where accuracy is important**, you should not rely on the detection results of small models, use `low_memory=False` to download larger models!
+
+    :param text: The text for language detection
+    :param low_memory: Whether to use the compressed version of the model (https://fasttext.cc/docs/en/language-identification.html)
+    :param model_download_proxy: Download proxy for the model if needed
+    :param k: Number of top languages to return
+    :param threshold: Minimum confidence score to consider
+    :param use_strict_mode: When this parameter is enabled, the fallback after loading failure will be disabled.
+    :return: A list of dictionaries with detected languages and confidence scores
     """
     model = load_model(
         low_memory=low_memory,
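For reference, the helpers added above can also be called directly to check an already downloaded model, and the warning text shows how to opt out of verification. A minimal usage sketch under those assumptions (the model path below is a hypothetical placeholder, not something defined by this commit):

# Hypothetical usage sketch of the helpers added in this commit.
from pathlib import Path
from fast_langdetect.ft_detect import infer

model_path = Path("/tmp/fasttext-models/lid.176.bin")  # placeholder location
if model_path.exists():
    ok = infer.verify_md5(model_path, infer.VERIFY_FASTTEXT_LARGE_MODEL)
    print("model integrity:", "ok" if ok else "MD5 mismatch")

# To skip verification entirely, as the warning message suggests:
# infer.VERIFY_FASTTEXT_LARGE_MODEL = None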

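The updated docstrings also stress that input must be a single line, otherwise a ValueError is raised. A minimal caller-side sketch (not part of the diff) of preparing text before calling detect and detect_multilingual:

# Hypothetical caller sketch: strip line breaks before detection, per the new docstrings.
from fast_langdetect import detect, detect_multilingual

raw = "Hello\nworld"
single_line = raw.replace("\n", " ")  # remove line breaks before detection

print(detect(single_line, low_memory=True))   # a dict with detected language and confidence score
print(detect_multilingual(single_line, k=3))  # top-3 candidates; default low_memory=False fetches the large model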