17
17
18
18
FASTTEXT_LARGE_MODEL_URL = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
19
19
FASTTEXT_LARGE_MODEL_NAME = "lid.176.bin"
20
+ VERIFY_FASTTEXT_LARGE_MODEL = "01810bc59c6a3d2b79c79e6336612f65"
20
21
21
22
22
23
class DetectError (Exception ):
@@ -38,6 +39,36 @@ def cache_model(self, key: str, model) -> None:
38
39
39
40
_model_cache = ModelManager ()
40
41
42
+ import hashlib
43
+
44
+
45
def calculate_md5(file_path, chunk_size: int = 8192) -> str:
    """
    Calculate the MD5 hash of a file.

    The file is opened in binary mode and hashed chunk by chunk, so the whole
    file never has to be held in memory at once.

    :param file_path: Path to the file
    :param chunk_size: Number of bytes to read per chunk; must be positive
    :return: MD5 hash of the file as a lowercase hexadecimal string
    :raises ValueError: If chunk_size is not a positive integer
    :raises OSError: If the file cannot be opened or read
    """
    # read(0) returns b'' immediately, which would end the loop before any
    # data is hashed and silently yield the digest of an empty file.
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    digest = hashlib.md5()
    with open(file_path, 'rb') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:  # empty bytes object signals end of file
                break
            digest.update(chunk)
    return digest.hexdigest()
58
+
59
+
60
def verify_md5(file_path, expected_md5, chunk_size: int = 8192) -> bool:
    """
    Verify the MD5 hash of a file against an expected hash.

    The expected hash is compared case-insensitively and with surrounding
    whitespace ignored, so values copied from checksum listings (upper-case
    hex, trailing newline) still match.

    :param file_path: Path to the file
    :param expected_md5: Expected MD5 hash as a hexadecimal string
    :param chunk_size: Size of each chunk to read from the file
    :return: True if the file's MD5 hash matches the expected hash, False otherwise
    """
    actual_md5 = calculate_md5(file_path, chunk_size)
    # A non-string expectation (e.g. None) can never equal a hex digest.
    if not isinstance(expected_md5, str):
        return False
    # The computed hex digest is lowercase; normalise the expected value to match.
    return actual_md5 == expected_md5.strip().lower()
71
+
41
72
42
73
def download_model (
43
74
download_url : str ,
@@ -84,12 +115,23 @@ def load_fasttext_model(
84
115
:return: FastText model
85
116
:raises DetectError: If model loading fails
86
117
"""
87
- if not model_path .exists () and download_url :
88
- # Attempt to download the model
89
- download_model (download_url , model_path , proxy )
90
-
118
+ if all ([
119
+ VERIFY_FASTTEXT_LARGE_MODEL ,
120
+ model_path .exists (),
121
+ model_path .name == FASTTEXT_LARGE_MODEL_NAME ,
122
+ ]):
123
+ if not verify_md5 (model_path , VERIFY_FASTTEXT_LARGE_MODEL ):
124
+ logger .warning (
125
+ f"fast-langdetect: MD5 hash verification failed for { model_path } , "
126
+ f"please check the integrity of the downloaded file from { FASTTEXT_LARGE_MODEL_URL } . "
127
+ "\n This may seriously reduce the prediction accuracy. "
128
+ "If you want to ignore this, please set `fast_langdetect.ft_detect.infer.VERIFY_FASTTEXT_LARGE_MODEL = None` "
129
+ )
91
130
if not model_path .exists ():
92
- raise DetectError (f"FastText model file not found at { model_path } " )
131
+ if download_url :
132
+ download_model (download_url , model_path , proxy )
133
+ if not model_path .exists ():
134
+ raise DetectError (f"FastText model file not found at { model_path } " )
93
135
94
136
try :
95
137
# Load FastText model
0 commit comments