extract_abstract.py
import requests
import fitz  # PyMuPDF
import re
import os
import json

def extract_abstract(url):
    # Set headers to avoid a 406 error from servers that reject the default user agent
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
        "Accept": "application/pdf",
    }
    # Fetch the PDF file content
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        # Save the PDF content to a temporary file
        with open("temp.pdf", "wb") as f:
            f.write(response.content)
        # Open the PDF file using PyMuPDF (fitz)
        try:
            with fitz.open("temp.pdf") as doc:
                full_text = ""
                # Iterate through the pages and concatenate their text
                for page_num in range(len(doc)):
                    page = doc.load_page(page_num)
                    # Extract the plain text of the page
                    full_text += page.get_text("text")
        except Exception:
            print("Could not load PDF")
            full_text = ""
        # Split the extracted text into lines
        lines = full_text.split('\n')
        # Iterate through the lines and extract the abstract
        abstract_lines = []
        capture = False
        for line in lines:
            # Start capturing once a line contains the word "Abstract" (case-insensitive)
            if re.search(r"\babstract\b", line, re.IGNORECASE):
                capture = True
                continue
            # While capturing, collect lines until a typical section header or an empty line is reached
            if capture:
                if re.match(r"(\d\.|Background|Introduction|System Overview|Methods|References)", line, re.IGNORECASE) or line.strip() == "":
                    capture = False
                    break
                abstract_lines.append(line.strip())
        # Join the captured lines to form the abstract text
        abstract = " ".join(abstract_lines).strip()
        # Remove the temporary PDF file
        os.remove("temp.pdf")
        return abstract
    else:
        print(f"Failed to fetch the PDF file. Status code: {response.status_code}")
        return None
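
# Usage sketch for extract_abstract (illustrative only; the record ID and file
# name below are hypothetical, not taken from the proceedings data):
#
#     abstract = extract_abstract("https://zenodo.org/records/1234567/files/paper.pdf")
#     if abstract:
#         print(abstract[:200])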

def process_json_files(folder_path):
    # Process every JSON file in the given folder
    for filename in os.listdir(folder_path):
        if filename.endswith(".json"):
            print('Processing', filename)
            file_path = os.path.join(folder_path, filename)
            # Open and load the JSON file
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Iterate through each entry in the JSON file
            for entry in data:
                # Only handle entries whose abstract is missing or still marked '[TODO]'
                if not entry.get('abstract') or '[TODO]' in entry.get('abstract'):
                    print('Processing:', entry.get('title'))
                    url = entry.get('ee')
                    if url is None or 'https' not in url:
                        try:
                            # Fall back to the Zenodo API to find the record's first file
                            file_key = requests.get(f"https://zenodo.org/api/records/{entry.get('zenodo_id')}/files").json()["entries"][0]["key"]
                            url = f"https://zenodo.org/records/{entry.get('zenodo_id')}/files/{file_key}"
                            entry['ee'] = url
                            # Extract the abstract using the reconstructed URL
                            abstract = extract_abstract(url)
                        except Exception:
                            print("Could not extract URL")
                            abstract = None
                    else:
                        # Extract the abstract using the existing URL
                        abstract = extract_abstract(url)
                    # If the abstract was successfully extracted, update it in the JSON
                    if abstract:
                        entry['abstract'] = abstract
                    else:
                        print(f"Failed to extract abstract for {entry['title']}")
            # Save the updated JSON data back to the file
            with open(file_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=4)
            print(f"Processed {filename}")

def clean_up_abstracts(proceedings_folder):
    """
    Clean up the abstracts by removing HTML tags, HTML entities, and
    typographic ligature characters left over from PDF extraction.
    """
    for filename in os.listdir(proceedings_folder):
        if filename.endswith(".json"):
            file_path = os.path.join(proceedings_folder, filename)
            with open(file_path) as file:
                data = json.load(file)
            for paper in data:
                abstract = paper.get('abstract')
                if abstract:
                    # Remove all HTML tags (e.g. </p>\n\n<p> </p>)
                    abstract = re.sub(r'<[^>]*>', '', abstract)
                    # Remove HTML entities (e.g. &amp;)
                    abstract = re.sub(r'&[a-z]+;', '', abstract)
                    # Replace Unicode ligatures with their plain-letter equivalents
                    abstract = abstract.replace("\ufb01", "fi")
                    abstract = abstract.replace("\ufb02", "fl")
                    abstract = abstract.replace("\ufb03", "ffi")
                    abstract = abstract.replace("\ufb04", "ffl")
                    abstract = abstract.replace("\ufb00", "ff")
                    # Drop private-use and invalid characters
                    abstract = abstract.replace("\uf6d9", "")
                    abstract = abstract.replace("\uffff", "")
                    paper['abstract'] = abstract
            with open(file_path, 'w') as file:
                json.dump(data, file, indent=4)
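
# Run as a script, e.g.:  python extract_abstract.py
# The script expects a 'proceedings' directory containing the JSON files,
# relative to the current working directory.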
if __name__ == "__main__":
    # Set the path to the 'proceedings' folder
    proceedings_folder = 'proceedings'
    # Fill in missing abstracts from the linked PDFs
    process_json_files(proceedings_folder)
    # Clean up the extracted abstracts
    clean_up_abstracts(proceedings_folder)