Skip to content

Commit

Permalink
元素类型引用统一定义
Browse files Browse the repository at this point in the history
  • Loading branch information
xuchao committed Mar 16, 2024
1 parent d5ea44f commit 83753cb
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 16 deletions.
8 changes: 4 additions & 4 deletions demo/ocr_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,13 @@ def read_json_file(file_path):


if __name__ == '__main__':
ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
#ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
#ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"

# ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
# ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
try:
ocr_pdf_model_info = read_json_file(ocr_json_file_path)
pth = Path(ocr_json_file_path)
Expand Down
6 changes: 5 additions & 1 deletion magic_pdf/dict2md/mkcontent.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,11 @@ def mk_mm_markdown(content_list):
if content_type == "text":
content_md.append(c.get("text"))
elif content_type == "equation":
content_md.append(f"$$\n{c.get('latex')}\n$$")
content = c.get("latex")
if content.startswith("$$") and content.endswith("$$"):
content_md.append(content)
else:
content_md.append(f"\n$$\n{c.get('latex')}\n$$\n")
elif content_type in UNI_FORMAT_TEXT_TYPE:
content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
elif content_type == "image":
Expand Down
25 changes: 14 additions & 11 deletions magic_pdf/para/para_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
from loguru import logger

from magic_pdf.libs.boxbase import _is_in
from magic_pdf.libs.ocr_content_type import ContentType


# Characters that mark the end of a line/sentence when they are the last
# character of a span. Both the ASCII form and the full-width (CJK) form of
# each terminator are listed so that Chinese text is recognised as well.
# The full-width forms are written as escapes so that width-normalising
# tools cannot silently collapse them into duplicates of the ASCII entries.
LINE_STOP_FLAG = [
    '.', '!', '?',
    '\u3002',  # ideographic full stop
    '\uff01',  # full-width exclamation mark
    '\uff1f',  # full-width question mark
    ':',
    '\uff1a',  # full-width colon
    ')',
    '\uff09',  # full-width right parenthesis
    ';',
]
INLINE_EQUATION = 'inline_equation'
INTER_EQUATION = "displayed_equation"
INLINE_EQUATION = ContentType.InlineEquation
INTERLINE_EQUATION = ContentType.InterlineEquation
TEXT = "text"

def __add_line_period(blocks, layout_bboxes):
Expand All @@ -20,20 +21,19 @@ def __add_line_period(blocks, layout_bboxes):
for line in block['lines']:
last_span = line['spans'][-1]
span_type = last_span['type']
if span_type in [TEXT, INLINE_EQUATION]:
if span_type in [INLINE_EQUATION]:
span_content = last_span['content'].strip()
if span_type==INLINE_EQUATION and span_content[-1] not in LINE_STOP_FLAG:
if span_type in [INLINE_EQUATION, INTER_EQUATION]:
if span_type in [INLINE_EQUATION, INTERLINE_EQUATION]:
last_span['content'] = span_content + '.'



def __valign_lines(blocks, layout_bboxes):
"""
对齐行的左侧和右侧。
扫描行的左侧和右侧,如果x0, x1差距不超过3就强行对齐到所处layout的左右两侧(和layout有一段距离)。
3是个经验值,TODO,计算得来
在一个layoutbox内对齐行的左侧和右侧。
扫描行的左侧和右侧,如果x0, x1差距不超过一个阈值,就强行对齐到所处layout的左右两侧(和layout有一段距离)。
3是个经验值,TODO,计算得来,可以设置为1.5个正文字符。
"""

min_distance = 3
Expand Down Expand Up @@ -159,11 +159,14 @@ def get_span_text(span):
else:
para.append(line)
else: # 其他,图片、表格、行间公式,各自占一段
para.append(line)
paras.append(para)
if len(para)>0:
paras.append(para)
para = []
else:
paras.append([line])
para = []
# para_text = ''.join([get_span_text(span) for line in para for span in line['spans']])
# logger.info(para_text)
para = []
if len(para)>0:
paras.append(para)
# para_text = ''.join([get_span_text(span) for line in para for span in line['spans']])
Expand Down

0 comments on commit 83753cb

Please sign in to comment.