Skip to content

Commit

Permalink
OCR line的左右侧如果超过layoutbox,那么让layoutbox截断左右侧
Browse files Browse the repository at this point in the history
  • Loading branch information
[email protected] committed Apr 10, 2024
1 parent ec187a1 commit c3b8f6d
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 6 deletions.
6 changes: 3 additions & 3 deletions demo/ocr_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,8 @@ def ocr_parse_pdf_core(pdf_bytes, model_output_json_list, book_name, start_page_
if __name__ == '__main__':
pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
# ocr_local_parse(pdf_path, json_file_path)
book_name = "科数网/edu_00011318"
ocr_online_parse(book_name)
ocr_local_parse(pdf_path, json_file_path)
# book_name = "科数网/edu_00011318"
# ocr_online_parse(book_name)

pass
25 changes: 22 additions & 3 deletions magic_pdf/para/para_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,11 +183,31 @@ def __valign_lines(blocks, layout_bboxes):
return new_layout_bboxes


def __align_text_in_layout(blocks, layout_bboxes):
    """
    Clip OCR lines horizontally to the layout box that contains their block.

    OCR lines sometimes carry blank space before or after the text, which
    makes them stick out of the layout box. For every block that
    `is_in_layout` reports as inside a layout box, the left and right edges
    of each of its lines are truncated to that layout box's left and right
    edges.

    The line bboxes are modified in place; nothing is returned.
    """
    for layout_item in layout_bboxes:
        layout_box = layout_item['layout_bbox']
        # Select the member blocks up front, before any line bbox is touched.
        member_blocks = list(
            filter(lambda blk: is_in_layout(blk['bbox'], layout_box), blocks)
        )

        for blk in member_blocks:
            for text_line in blk['lines']:
                line_box = text_line['bbox']
                left, right = line_box[0], line_box[2]
                # Only write when the line actually overflows that side.
                if left < layout_box[0]:
                    line_box[0] = layout_box[0]
                if right > layout_box[2]:
                    line_box[2] = layout_box[2]


def __common_pre_proc(blocks, layout_bboxes):
    """
    Language-independent pre-processing of the text.

    First clips the lines to their layout boxes via
    `__align_text_in_layout`, then runs `__valign_lines` and returns its
    result.
    """
    # NOTE: the `__add_line_period(blocks, layout_bboxes)` step is currently disabled.
    __align_text_in_layout(blocks, layout_bboxes)
    return __valign_lines(blocks, layout_bboxes)
Expand Down Expand Up @@ -233,7 +253,6 @@ def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_
layout_paras = []
right_tail_distance = 1.5 * char_avg_len


for lines in lines_group:
paras = []
total_lines = len(lines)
Expand Down Expand Up @@ -575,8 +594,8 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):


return connected_layout_paras, page_list_info


def para_split(pdf_info_dict, debug_mode, lang="en"):
"""
根据line和layout情况进行分段
Expand Down

0 comments on commit c3b8f6d

Please sign in to comment.