Skip to content

Commit f63753a

Browse files
committed
修复bug
1 parent 370a759 commit f63753a

File tree

1 file changed

+21
-18
lines changed

1 file changed

+21
-18
lines changed

src/PaperCrawlerUtil/pdf_util.py

Lines changed: 21 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -472,7 +472,7 @@ def pdf2docx(pdf_path: str, word_path: str, end_pages: int = None,
472472
file_list = []
473473
file = True
474474
count = 0
475-
if os.path.isfile(pdf_path) and os.path.isfile:
475+
if os.path.isfile(pdf_path):
476476
file_list.append(pdf_path)
477477
if need_log:
478478
log("转换文件{}开始".format(pdf_path))
@@ -484,16 +484,13 @@ def pdf2docx(pdf_path: str, word_path: str, end_pages: int = None,
484484
for ele in file_list:
485485
if ele.endswith(".pdf"):
486486
try:
487-
cv = Converter(ele)
487+
cv = MyPdf2DocxConverter(ele)
488488
if start_pages is None:
489489
start_pages = 0
490490
if end_pages is None:
491491
end_pages = len(cv.pages)
492-
if not file:
493-
cv.convert(local_path_generate(word_path), start=start_pages, end=end_pages)
494-
else:
495-
cv.convert(local_path_generate(word_path, suffix=".docx"), start=start_pages, end=end_pages)
496-
count = count + 1
492+
cv.convert(local_path_generate(word_path, suffix=".docx"), start=start_pages, end=end_pages)
493+
count = count + 1
497494
log("总计pdf文件个数{},已经完成{}".format(len(file_list), count))
498495
except Exception as e:
499496
log(string="转换失败文件{},{}".format(ele, e), print_file=sys.stderr)
@@ -691,11 +688,6 @@ def getSomePagesFromOnePDF(path: str, out_path: str, page_range: tuple or list,
691688
iters = None
692689
if type(page_range) == tuple:
693690
new_page_range = ()
694-
for k in page_range:
695-
'''@todo: 完善verify_rule()'''
696-
if not (0 <= k <= pdf_pages_len - 1):
697-
log(string="范围参数有错", print_file=sys.stderr)
698-
return False
699691
if len(page_range) == 0:
700692
log(string="页码范围不明确,返回错误", print_file=sys.stderr)
701693
return False
@@ -707,6 +699,17 @@ def getSomePagesFromOnePDF(path: str, out_path: str, page_range: tuple or list,
707699
new_page_range = (page_range[0], page_range[1])
708700
else:
709701
new_page_range = (page_range[0], page_range[1])
702+
# check tuple legal or not
703+
a = []
704+
for k in new_page_range:
705+
if k < 0:
706+
a.append(0)
707+
elif k > pdf_pages_len:
708+
a.append(pdf_pages_len)
709+
a.append(k)
710+
new_page_range = (a[0], a[1])
711+
if new_page_range[1] < new_page_range[0]:
712+
new_page_range = (new_page_range[1], new_page_range[0])
710713
iters = range(new_page_range[0], new_page_range[1])
711714
else:
712715
# 去重
@@ -809,7 +812,7 @@ def cooperatePdfWithLimit(files: list, page_range: tuple or list = None, out_pat
809812
for p in range(len(out_path) - 4):
810813
temp.append(out_path[p])
811814
out_path = "".join(temp)
812-
out_path = (out_path + group_id + ".pdf") if len(out_path) != 0 else local_path_generate("")
815+
out_path = (out_path + "-groupid-" +group_id + ".pdf") if len(out_path) != 0 else local_path_generate("")
813816
else:
814817
out_path = out_path if len(out_path) != 0 else local_path_generate("")
815818
count = 0
@@ -837,12 +840,11 @@ def cooperatePdfWithLimit(files: list, page_range: tuple or list = None, out_pat
837840
if len(page_range) == 0:
838841
if need_log:
839842
log("默认全部合并,因为范围为空")
840-
new_page_range[0] = 0
841-
new_page_range[1] = pdf_pages_len - 1
843+
new_page_range = (0, pdf_pages_len)
842844
elif len(page_range) == 1:
843845
if need_log:
844846
log("使用范围截取,但只有一个参数,结束参数默认为最大值")
845-
new_page_range = (page_range[0], pdf_pages_len - 1)
847+
new_page_range = (page_range[0], pdf_pages_len)
846848
elif len(page_range) > 2:
847849
if need_log:
848850
log("使用范围参数,但参数数量过多,截取两个")
@@ -911,13 +913,12 @@ def cooperatePdf(path: str, page_range: tuple or list = None, out_path: str = ""
911913
:param need_group: 是否需要分组合并
912914
:return:
913915
"""
914-
need_log = need_log if not need_processbar else False
915916
if len(path) == 0:
916917
log("给定路径为空,合并结束:{}".format(path))
917918
elif os.path.isfile(path):
918919
log("给定的是文件路径,合并结束:{}".format(path))
919920
if page_range is None:
920-
page_range = []
921+
page_range = ()
921922
files = getAllFiles(path)
922923
if need_group:
923924
if need_processbar:
@@ -928,6 +929,8 @@ def cooperatePdf(path: str, page_range: tuple or list = None, out_path: str = ""
928929
file_group = []
929930
while i < len(files):
930931
if (i != 0 and ((i % group_number) == 0)) or (i == len(files) - 1):
932+
if i == len(files) - 1:
933+
file_group.append(files[i])
931934
cooperatePdfWithLimit(file_group, page_range, out_path, need_log, timeout, str(group_id))
932935
group_id = group_id + 1
933936
file_group.clear()

0 commit comments

Comments
 (0)