Skip to content

Commit 7e43285

Browse files
kunal gaurkunal gaur
kunal gaur
authored and
kunal gaur
committed
Added 1 More Python Program to Extract Tables From docx,txt and PDF
1 parent 8682a38 commit 7e43285

15 files changed

+107
-0
lines changed

.DS_Store

34 KB
Binary file not shown.
6 KB
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

Extract-Table-from-pdf-txt-docx/Parent/Child1/Text_Child1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
AB,DF,G,DF,SDF,ADA,QW,WE,ER,FD2,45,56,7,8,9,65,3,5436,7812,34,345,667,56,5657,768,45,46,6767,89,8,9,89,8,78,9,67,671,23,4,5,65,76,8,6,45,67
Binary file not shown.
Binary file not shown.
Binary file not shown.

Extract-Table-from-pdf-txt-docx/Parent/Child2/Text_Child2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
AC,DXFC,GB,DCF,SCDF,BADA,QB,W,R,F,C2,45,56,7,8,9,65,3,5436,78,3412,34,345,667,56,5657,768,45,46,67,3467,89,8,9,89,8,78,9,67,67,431,23,4,5,65,76,8,6,45,67,61
Binary file not shown.
Binary file not shown.
Binary file not shown.

Extract-Table-from-pdf-txt-docx/Parent/Child3/Text_Child3.txt

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
AF,FC,GFB,DW,SF,BA,Q,WS,RR,FR,CW2,45,56,7,8,9,65,3,5436,78,3412,34,345,667,56,5657,768,45,46,67,3467,89,8,9,89,8,78,9,67,67,431,23,4,5,65,76,8,6,45,67,61
+104
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
# %%
2+
import pandas as pd
3+
import os
4+
import tabula
5+
from docx.api import Document
6+
7+
# %%
8+
9+
def read_docx_table(path):
    """Return the first table of the .docx file at *path* as a DataFrame.

    The first table row supplies the column names; every later row becomes
    one record.  An empty DataFrame is returned when the document holds no
    table (or the table has no rows) instead of raising ``IndexError``.
    """
    tables = Document(path).tables
    if not tables:
        return pd.DataFrame()
    rows = [[cell.text for cell in row.cells] for row in tables[0].rows]
    if not rows:
        return pd.DataFrame()
    header, *body = rows
    # dict(zip(...)) keeps the original record-building semantics
    # (a repeated header name keeps the value of its last column).
    return pd.DataFrame([dict(zip(header, values)) for values in body])


def load_child_tables(parent_dir, child_name):
    """Load the PDF, .docx and text tables stored in one child directory.

    Looks inside ``parent_dir/child_name`` for ``Pdf1_<child_name>.pdf``,
    ``Document_<child_name>.docx`` and ``Text_<child_name>.txt``.

    Returns:
        A ``(pdf_tables, docx_frame, text_frame)`` tuple.  ``pdf_tables`` is
        whatever ``tabula.read_pdf(..., pages='all')`` returns, the other two
        are DataFrames.  An entry is ``None`` when its file (or the whole
        child directory) does not exist, so callers never hit an undefined
        name.

    The process working directory is never changed; all paths are built
    explicitly, which makes the cells safe to re-run in any order.
    """
    child_dir = os.path.join(parent_dir, child_name)
    pdf_path = os.path.join(child_dir, 'Pdf1_{}.pdf'.format(child_name))
    docx_path = os.path.join(child_dir, 'Document_{}.docx'.format(child_name))
    text_path = os.path.join(child_dir, 'Text_{}.txt'.format(child_name))

    pdf_tables = docx_frame = text_frame = None
    # PDF file reading
    if os.path.isfile(pdf_path):
        pdf_tables = tabula.read_pdf(pdf_path, pages='all')
    # Document reading
    if os.path.isfile(docx_path):
        docx_frame = read_docx_table(docx_path)
    # Text reading
    if os.path.isfile(text_path):
        text_frame = pd.read_csv(text_path)
    return pdf_tables, docx_frame, text_frame


# Directory holding Child1..Child3.  Use ./Parent when it exists; otherwise
# assume the working directory already is the parent directory (this is the
# state the old chdir-based cells left behind after their first run).
PARENT_DIR = os.path.abspath('Parent' if os.path.isdir('Parent') else os.curdir)

# FOR CHILD1 DIRECTORY
df_pdf_child1, df_document_child1, df_text_child1 = load_child_tables(
    PARENT_DIR, 'Child1')

# %%
df_text_child1


# %%
# FOR CHILD2 DIRECTORY
df_pdf_child2, df_document_child2, df_text_child2 = load_child_tables(
    PARENT_DIR, 'Child2')

# %%
# Preview the first extracted PDF table, if any table was extracted.
df_pdf_child2[0].head(4) if df_pdf_child2 else None

# %%
# FOR CHILD3 DIRECTORY
df_pdf_child3, df_document_child3, df_text_child3 = load_child_tables(
    PARENT_DIR, 'Child3')

# %%
df_text_child3
103+
104+
# %%

0 commit comments

Comments
 (0)