Liwu-di
diff --git a/‎src/PaperCrawlerUtil/code_generate.py renamed to ‎src/PaperCrawlerUtil/application.py
Lines changed: 10 additions & 5 deletions b/‎src/PaperCrawlerUtil/code_generate.py renamed to ‎src/PaperCrawlerUtil/application.py
Lines changed: 10 additions & 5 deletions
diff --git a/‎src/PaperCrawlerUtil/common_util.py
Lines changed: 1 addition & 1 deletion b/‎src/PaperCrawlerUtil/common_util.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/PaperCrawlerUtil/constant.py
Lines changed: 23 additions & 0 deletions b/‎src/PaperCrawlerUtil/constant.py
Lines changed: 23 additions & 0 deletions
diff --git a/‎src/PaperCrawlerUtil/crawler_util.py
Lines changed: 11 additions & 0 deletions b/‎src/PaperCrawlerUtil/crawler_util.py
Lines changed: 11 additions & 0 deletions
diff --git a/‎src/PaperCrawlerUtil/document_util.py
Lines changed: 4 additions & 1 deletion b/‎src/PaperCrawlerUtil/document_util.py
Lines changed: 4 additions & 1 deletion
@@ -1,23 +1,28 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2022/9/5 16:50
 # @Author  : 银尘
-# @FileName: code_generate.py
+# @FileName: application.py
 # @Software: PyCharm
 # @Email   ：liwudi@liwudi.fun
 import json
 from common_util import *
 from flask import Flask, request
 from constant import *
 
-code_generate = Flask(__name__)
+"""
+This file holds applications built on PaperCrawlerUtil.
+They can be run with Flask to provide services to a website.
+"""
 
+applications = Flask(__name__)
 
-@code_generate.route("/")
+
+@applications.route("/")
 def hello_world():
     return 'hello world'
 
 
-@code_generate.route("/code_generate/", methods=[POST])
+@applications.route("/code_generate/", methods=[POST])
 def generate():
     data = json.loads(request.get_data())
     # 到最终保存或提取文件需要多少层
@@ -42,4 +47,4 @@ def generate():
 
 
 if __name__ == "__main__":
-    code_generate.run(host="0.0.0.0", port=8000, debug=True)
+    applications.run(host="0.0.0.0", port=8000, debug=True)
@@ -309,7 +309,7 @@ def set_cross_file_variable(key_val: List[tuple]) -> bool:
 
 def is_ip(proxy_test: str = "") -> bool:
     """
-    测试字符串是否是一个ip地址
+    测试字符串是否是一个http | https ip地址
     :param proxy_test: 待测试字符串
     :return:是则返回True
     """
 
@@ -37,6 +37,16 @@
 KEEP_PROCESS_BAR_STYLE = "keep_process_bar_style"
 KEEP_PROCESS_BAR_STYLE_FILE = "keep_process_bar_style_file"
 
+
+"""
+Chain-translation definitions
+"""
+ACCURACY = "accuracy"
+MORE = "MORE"
+GOOGLE_TRANSLATOR = "google"
+BAIDU_TRANSLATOR = "baidu"
+
+
 """
 存储方式定义
 """
@@ -323,6 +333,19 @@
 """祖鲁语"""
 ZU = "zu"
 
+ALL_LANGUAGE_LIST = [AUTO, "auto", AF, "af", SQ, "sq", AM, "am", AR, "ar", HY, "hy", AZ, "az", EU, "eu", BE, "be", BN,
+                     "bn", BS, "bs", BG, "bg", CA, "ca", CEB, "ceb", ZH_CN, "zh-CN", ZH_TW, "zh-TW", CO, "co", HR, "hr",
+                     CS, "cs", DA, "da", NL, "nl", EN, "en", EO, "eo", ET, "et", FI, "fi", FR, "fr", FY, "fy", GL, "gl",
+                     KA, "ka", DE, "de", EL, "el", GU, "gu", HT, "ht", HA, "ha", HAW, "haw", HE, "he", HI, "hi", HMN,
+                     "hmn", HU, "hu", IS_, "is", IG, "ig", ID, "id", GA, "ga", IT, "it", JA, "ja", JV, "jv", KN, "kn",
+                     KK, "kk", KM, "km", RW, "rw", KO, "ko", KU, "ku", KY, "ky", LO, "lo", LA, "la", LV, "lv", LT, "lt",
+                     LB, "lb", MK, "mk", MG, "mg", MS, "ms", ML, "ml", MT, "mt", MI, "mi", MR, "mr", MN, "mn", MY, "my",
+                     NE, "ne", NO, "no", NY, "ny", OR_, "or", PS, "ps", FA, "fa", PL, "pl", PT, "pt", PA, "pa", RO,
+                     "ro", RU, "ru", SM, "sm", GD, "gd", SR, "sr", ST, "st", SN, "sn", SD, "sd", SI, "si", SK, "sk", SL,
+                     "sl", SO, "so", ES, "es", SU, "su", SW, "sw", SV, "sv", TL, "tl", TG, "tg", TA, "ta", TT, "tt", TE,
+                     "te", TH, "th", TR, "tr", TK, "tk", UK, "uk", UR, "ur", UG, "ug", UZ, "uz", VI, "vi", CY, "cy", XH,
+                     "xh", YI, "yi", YO, "yo", ZU, "zu"]
+
 """
 谷歌国家地区域名
 """
 
@@ -486,5 +486,16 @@ def google_scholar_search_crawler(contain_all: List[str] = None, contain_complet
         return html
 
 
+def get_all_link_from_html(html: str, get_type: str = ACCURACY):
+    """
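+    Get all links in html. Two modes: an accuracy mode that only recognises links starting with http/https or href;
+    and a broad mode that recognises as many links as possible, e.g. /adta/download/jafs.pdf; it needs a prefix link.
+    The broad mode returns two parts: the accuracy-mode links and the additional best-effort links.
+    :param get_type: extraction mode; defaults to ACCURACY
+    :param html: the html text to search for links
+    :return: nothing yet; the function body is not implemented in this change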
+    """
+
+
 if __name__ == "__main__":
     basic_config(logs_style=LOG_STYLE_PRINT)
@@ -2,6 +2,9 @@
 from PaperCrawlerUtil.pdf_util import *
 from PaperCrawlerUtil.office_util import *
 
-
 if __name__ == "__main__":
     basic_config(logs_style=LOG_STYLE_PRINT)
+    # for i in range(7):
+    #     getSomePagesFromFileOrDirectory(
+    #         path="D:\\python project\\PaperCrawlerUtil\\src\\PaperCrawlerUtil\\09_43_0370543.pdf",
+    #         page_range=(i * 50, (i + 1) * 50))