Skip to content

Commit 9d2c7d0

Browse files
committed
Merge branch 'hotfix_fix_pdf_util'
# Conflicts:
#	src/PaperCrawlerUtil/pdf_util.py
2 parents e6ebefb + ee62ad5 commit 9d2c7d0

File tree

8 files changed

+591
-45
lines changed

8 files changed

+591
-45
lines changed

src/PaperCrawlerUtil/code_generate.py renamed to src/PaperCrawlerUtil/application.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,28 @@
11
# -*- coding: utf-8 -*-
22
# @Time : 2022/9/5 16:50
33
# @Author : 银尘
4-
# @FileName: code_generate.py
4+
# @FileName: application.py
55
# @Software: PyCharm
66
# @Email :liwudi@liwudi.fun
77
import json
88
from common_util import *
99
from flask import Flask, request
1010
from constant import *
1111

12-
code_generate = Flask(__name__)
12+
"""
13+
This file contains applications built with PaperCrawlerUtil
14+
that can be run with Flask to provide services to a website.
15+
"""
1316

17+
applications = Flask(__name__)
1418

15-
@code_generate.route("/")
19+
20+
@applications.route("/")
1621
def hello_world():
1722
return 'hello world'
1823

1924

20-
@code_generate.route("/code_generate/", methods=[POST])
25+
@applications.route("/code_generate/", methods=[POST])
2126
def generate():
2227
data = json.loads(request.get_data())
2328
# 到最终保存或提取文件需要多少层
@@ -42,4 +47,4 @@ def generate():
4247

4348

4449
if __name__ == "__main__":
45-
code_generate.run(host="0.0.0.0", port=8000, debug=True)
50+
applications.run(host="0.0.0.0", port=8000, debug=True)

src/PaperCrawlerUtil/common_util.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,7 @@ def set_cross_file_variable(key_val: List[tuple]) -> bool:
309309

310310
def is_ip(proxy_test: str = "") -> bool:
311311
"""
312-
测试字符串是否是一个ip地址
312+
测试字符串是否是一个http | https ip地址
313313
:param proxy_test: 待测试字符串
314314
:return:是则返回True
315315
"""

src/PaperCrawlerUtil/constant.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,16 @@
3737
KEEP_PROCESS_BAR_STYLE = "keep_process_bar_style"
3838
KEEP_PROCESS_BAR_STYLE_FILE = "keep_process_bar_style_file"
3939

40+
41+
"""
42+
chain translate 定义
43+
"""
44+
ACCURACY = "accuracy"
45+
MORE = "MORE"
46+
GOOGLE_TRANSLATOR = "google"
47+
BAIDU_TRANSLATOR = "baidu"
48+
49+
4050
"""
4151
存储方式定义
4252
"""
@@ -323,6 +333,19 @@
323333
"""祖鲁语"""
324334
ZU = "zu"
325335

336+
ALL_LANGUAGE_LIST = [AUTO, "auto", AF, "af", SQ, "sq", AM, "am", AR, "ar", HY, "hy", AZ, "az", EU, "eu", BE, "be", BN,
337+
"bn", BS, "bs", BG, "bg", CA, "ca", CEB, "ceb", ZH_CN, "zh-CN", ZH_TW, "zh-TW", CO, "co", HR, "hr",
338+
CS, "cs", DA, "da", NL, "nl", EN, "en", EO, "eo", ET, "et", FI, "fi", FR, "fr", FY, "fy", GL, "gl",
339+
KA, "ka", DE, "de", EL, "el", GU, "gu", HT, "ht", HA, "ha", HAW, "haw", HE, "he", HI, "hi", HMN,
340+
"hmn", HU, "hu", IS_, "is", IG, "ig", ID, "id", GA, "ga", IT, "it", JA, "ja", JV, "jv", KN, "kn",
341+
KK, "kk", KM, "km", RW, "rw", KO, "ko", KU, "ku", KY, "ky", LO, "lo", LA, "la", LV, "lv", LT, "lt",
342+
LB, "lb", MK, "mk", MG, "mg", MS, "ms", ML, "ml", MT, "mt", MI, "mi", MR, "mr", MN, "mn", MY, "my",
343+
NE, "ne", NO, "no", NY, "ny", OR_, "or", PS, "ps", FA, "fa", PL, "pl", PT, "pt", PA, "pa", RO,
344+
"ro", RU, "ru", SM, "sm", GD, "gd", SR, "sr", ST, "st", SN, "sn", SD, "sd", SI, "si", SK, "sk", SL,
345+
"sl", SO, "so", ES, "es", SU, "su", SW, "sw", SV, "sv", TL, "tl", TG, "tg", TA, "ta", TT, "tt", TE,
346+
"te", TH, "th", TR, "tr", TK, "tk", UK, "uk", UR, "ur", UG, "ug", UZ, "uz", VI, "vi", CY, "cy", XH,
347+
"xh", YI, "yi", YO, "yo", ZU, "zu"]
348+
326349
"""
327350
谷歌国家地区域名
328351
"""

src/PaperCrawlerUtil/crawler_util.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -486,5 +486,16 @@ def google_scholar_search_crawler(contain_all: List[str] = None, contain_complet
486486
return html
487487

488488

489+
def get_all_link_from_html(html: str, get_type: str = ACCURACY):
490+
"""
491+
获取html中所有链接,有两种模式,一种是保证正确型,只识别http,https开头和href开头的
492+
还有一种是全面型,尽可能多的识别链接,比如/adta/download/jafs.pdf等等也识别为链接,这种需要配合前缀链接使用
493+
后一种返回时,会分为两部分,一部分是保证正确型,另一部分是尽可能多的识别的链接
494+
:param get_type:
495+
:param html:
496+
:return:
497+
"""
498+
499+
489500
if __name__ == "__main__":
490501
basic_config(logs_style=LOG_STYLE_PRINT)

src/PaperCrawlerUtil/document_util.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22
from PaperCrawlerUtil.pdf_util import *
33
from PaperCrawlerUtil.office_util import *
44

5-
65
if __name__ == "__main__":
76
basic_config(logs_style=LOG_STYLE_PRINT)
7+
# for i in range(7):
8+
# getSomePagesFromFileOrDirectory(
9+
# path="D:\\python project\\PaperCrawlerUtil\\src\\PaperCrawlerUtil\\09_43_0370543.pdf",
10+
# page_range=(i * 50, (i + 1) * 50))

0 commit comments

Comments
 (0)