forked from baulbo/Diard
-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
102 lines (84 loc) · 3.12 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import argparse
import logging
import os
import warnings
from modules.document import Document
from modules.layoutdetection import LayoutDetection
from modules.tables import TableExtractor
def main():
    """Run the Diard pipeline over every file in the source directory.

    Parses the command-line flags, configures logging, builds the layout
    detection predictor, and then, for each file listed in
    ``./resources/pdfs/``, creates a ``Document`` and calls its
    ``doc_to_images``, ``extract_layouts``, ``order_layouts``,
    ``save_layouts_as_json`` and ``save_layouts_as_html`` steps in order.

    Command-line flags:
        --overwrite: also process files whose name (without its last
            extension) already appears as an entry in ``./output/``.
            Without this flag those files are skipped and listed in an
            info log message.
        --skip-failures: when processing a file raises, log a warning and
            continue with the next file instead of exiting.

    Raises:
        Exception: whatever processing a document raised, re-raised
            unchanged unless ``--skip-failures`` was given.
    """
    parser = argparse.ArgumentParser(description="Diard pipeline script")
    parser.add_argument(
        "--overwrite",
        help="Reprocess and overwrite previously processed documents",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "--skip-failures",
        help="Skips files that cannot be processed instead of exiting program",
        action="store_true",
        default=False,
    )
    args = parser.parse_args()

    logging.basicConfig(
        format="%(asctime)s | %(levelname)s: %(message)s", level=logging.NOTSET
    )
    # silence DEBUG (and lower) records from every logger
    logging.disable(logging.DEBUG)

    # suppressing PyTorch & Detectron warnings
    # NOTE: comment out for debugging
    warnings.filterwarnings("ignore", category=UserWarning)

    ld_config_path = "./resources/model_configs/cascade/cascade_dit_large.yaml"
    ld_weights_path = "./resources/weights/publaynet_dit-l_cascade.pth"
    ld = LayoutDetection(
        cfg_path=ld_config_path,
        weights_path=ld_weights_path,
        device='cuda',  # change to 'cpu' if you don't have CUDA enabled GPU
        batch_size=1,
        workers=4,
        threshold=0.65,
    )
    predictor = ld.get_predictor()
    metadata = ld.get_metadata()

    source_dir = "./resources/pdfs/"
    output_dir = "./output/"

    # language used in most of your documents (ISO 639-3 format)
    lang = "deu"
    # only useful if lang_detect=True (all specified language packs should be installed)
    langs = ["eng", "fra", "deu"]

    # process single pdf
    # filenames = ["example.pdf"]
    # process multiple pdfs
    filenames = os.listdir(source_dir)

    # filter out previously processed documents
    if not args.overwrite:
        # A missing output directory just means nothing has been processed
        # yet; listing it directly would raise FileNotFoundError.
        if os.path.isdir(output_dir):
            processed = set(os.listdir(output_dir))
        else:
            processed = set()

        skipped_files = []
        pending_files = []
        for fn in filenames:
            # Name without its last extension; a name containing no dot
            # yields the empty string.
            stem = fn.rpartition(".")[0]
            if stem in processed:
                skipped_files.append(fn)
            else:
                pending_files.append(fn)
        filenames = pending_files

        if skipped_files:
            logging.info(f"Skipping the following {len(skipped_files)} file(s): {skipped_files}")

    for filename in filenames:
        try:
            doc = Document(
                os.path.join(source_dir, filename),
                predictor=predictor,
                metadata=metadata,
                lang=lang,
                lang_detect=True,
                langs=langs,
            )

            # extract & export layouts
            doc.doc_to_images()
            doc.extract_layouts(visualize=True, segment_sections=True)
            doc.order_layouts()
            doc.save_layouts_as_json()
            doc.save_layouts_as_html()
        except Exception as ex:
            if not args.skip_failures:
                logging.warning("Exiting")
                # bare raise re-raises the active exception unchanged
                raise
            logging.warning(f"Could not process '{filename}'. Exception: {ex}")
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()