Skip to content

Commit 1859e86

Browse files
committed
✨ feat(app): add MD5 verification for FastText model integrity
Introduce MD5 hash verification for the FastText model download. This ensures the integrity of the model file, reducing prediction errors due to corrupted downloads.
1 parent 2628dfe commit 1859e86

File tree

1 file changed

+47
-5
lines changed

1 file changed

+47
-5
lines changed

src/fast_langdetect/ft_detect/infer.py

Lines changed: 47 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
# Download URL and file name of the large FastText model.
FASTTEXT_LARGE_MODEL_URL = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
1919
FASTTEXT_LARGE_MODEL_NAME = "lid.176.bin"
20+
# Expected MD5 hex digest of the large model file; a falsy value (e.g. None)
# disables the integrity check performed when the model is loaded.
VERIFY_FASTTEXT_LARGE_MODEL = "01810bc59c6a3d2b79c79e6336612f65"
2021

2122

2223
class DetectError(Exception):
@@ -38,6 +39,36 @@ def cache_model(self, key: str, model) -> None:
3839

3940
_model_cache = ModelManager()
4041

42+
import hashlib
43+
44+
45+
def calculate_md5(file_path, chunk_size=8192) -> str:
    """
    Calculate the MD5 hash of a file.

    The file is opened in binary mode and fed to the hash object in pieces
    of ``chunk_size`` bytes.

    :param file_path: Path to the file
    :param chunk_size: Number of bytes to request per read; must not be 0.
        A negative value makes a single read return the whole file.
    :return: MD5 hash of the file as a hexadecimal string
    :raises ValueError: If ``chunk_size`` is 0
    """
    if chunk_size == 0:
        # read(0) always returns b'', which would end the loop before any
        # data is hashed and silently yield the digest of empty input.
        raise ValueError("chunk_size must not be 0")
    md5 = hashlib.md5()
    with open(file_path, 'rb') as f:
        # iter(callable, sentinel) keeps reading until read() returns b'' (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()
58+
59+
60+
def verify_md5(file_path, expected_md5, chunk_size=8192) -> bool:
    """
    Verify the MD5 hash of a file against an expected hash.

    The comparison ignores letter case and surrounding whitespace in
    ``expected_md5``, so a checksum copied in upper case or with a trailing
    newline still matches.

    :param file_path: Path to the file
    :param expected_md5: Expected MD5 hash as a hexadecimal string
    :param chunk_size: Size of each chunk to read from the file (passed
        through to ``calculate_md5``)
    :return: True if the file's MD5 hash matches the expected hash, False
        otherwise (including when ``expected_md5`` is not a string)
    """
    actual_md5 = calculate_md5(file_path, chunk_size)
    if not isinstance(expected_md5, str):
        # A non-string expectation can never equal a hex digest string.
        return False
    return actual_md5.lower() == expected_md5.strip().lower()
71+
4172

4273
def download_model(
4374
download_url: str,
@@ -84,12 +115,23 @@ def load_fasttext_model(
84115
:return: FastText model
85116
:raises DetectError: If model loading fails
86117
"""
87-
if not model_path.exists() and download_url:
88-
# Attempt to download the model
89-
download_model(download_url, model_path, proxy)
90-
118+
if all([
119+
VERIFY_FASTTEXT_LARGE_MODEL,
120+
model_path.exists(),
121+
model_path.name == FASTTEXT_LARGE_MODEL_NAME,
122+
]):
123+
if not verify_md5(model_path, VERIFY_FASTTEXT_LARGE_MODEL):
124+
logger.warning(
125+
f"fast-langdetect: MD5 hash verification failed for {model_path}, "
126+
f"please check the integrity of the downloaded file from {FASTTEXT_LARGE_MODEL_URL}. "
127+
"\n This may seriously reduce the prediction accuracy. "
128+
"If you want to ignore this, please set `fast_langdetect.ft_detect.infer.VERIFY_FASTTEXT_LARGE_MODEL = None` "
129+
)
91130
if not model_path.exists():
92-
raise DetectError(f"FastText model file not found at {model_path}")
131+
if download_url:
132+
download_model(download_url, model_path, proxy)
133+
if not model_path.exists():
134+
raise DetectError(f"FastText model file not found at {model_path}")
93135

94136
try:
95137
# Load FastText model

0 commit comments

Comments
 (0)