forked from acl-org/acl-anthology
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrepair_url.py
executable file
·84 lines (68 loc) · 2.54 KB
/
repair_url.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
"""Repairs URLs in Anthology XML files.
Usage: repair_url.py <infilename> <outfilename>
To do:
- check URLs of attachments
- incorporate into a more general XML-fixing script
"""
import lxml.etree as etree
import requests
import sys
from anthology.utils import test_url
def get_anth_url(volume_id, paper_id=None, width=4):
    """Build the aclanthology.org URL for a paper.

    The result is ``https://aclanthology.org/<volume_id>-<paper_id>``, where
    the paper number is zero-padded on the left to at least ``width`` digits.

    Args:
        volume_id: Identifier placed before the dash.
        paper_id: Integer paper number.  Despite the ``None`` default, an
            integer is required: the ``d`` format code rejects ``None``
            with a ``TypeError``.
        width: Minimum number of digits for the paper number.

    Returns:
        The URL as a string.
    """
    return f"https://aclanthology.org/{volume_id}-{paper_id:0{width}d}"
if __name__ == "__main__":
    # Script entry point: read the XML file named by the first argument,
    # repair the href/url information of every <paper> child of the root,
    # and write the result to the file named by the second argument.
    # Every change is logged to stderr as "<file>:<line> <action>".

    # Report the expected usage instead of failing with a bare IndexError.
    if len(sys.argv) < 3:
        sys.exit("Usage: repair_url.py <infilename> <outfilename>")
    filename = sys.argv[1]
    outfilename = sys.argv[2]

    tree = etree.parse(filename)
    volume = tree.getroot()
    for paper in volume.findall("paper"):
        # An href *attribute* on <paper> is dropped when it fails test_url.
        if "href" in paper.attrib:
            if not test_url(paper.attrib["href"]):
                sys.stderr.write(
                    "{}:{} removing href attribute: {}\n".format(
                        filename, paper.sourceline, paper.attrib["href"]
                    )
                )
                del paper.attrib["href"]

        # An <href> child *element* gets the same treatment.
        href = paper.find("href")
        if href is not None:
            # The element is expected to hold text only, no sub-elements.
            assert len(href) == 0
            if not test_url(href.text):
                sys.stderr.write(
                    "{}:{} removing href element: {}\n".format(
                        filename, href.sourceline, href.text
                    )
                )
                paper.remove(href)

        # Canonical URL derived from the root's id and the paper's numeric id.
        anth_url = get_anth_url(volume.attrib["id"], int(paper.attrib["id"]))
        anth_url_good = test_url(anth_url)

        url = paper.find("url")
        # As with <href>, a <url> element must not contain sub-elements.
        assert url is None or len(url) == 0

        if url is None:
            # No <url> yet: add one, but only if the canonical URL passes.
            if anth_url_good:
                url = etree.Element("url")
                url.text = anth_url
                sys.stderr.write(
                    "{}:{} inserting url element: {}\n".format(
                        filename, paper.sourceline, anth_url
                    )
                )
                paper.append(url)
        elif not anth_url_good:
            # The canonical URL fails the check, so the existing <url> is removed.
            sys.stderr.write(
                "{}:{} removing url element because {} is bad\n".format(
                    filename, url.sourceline, anth_url
                )
            )
            paper.remove(url)
        elif url.text != anth_url:
            # The canonical URL passes but the file has something else: rewrite.
            sys.stderr.write(
                "{}:{} rewriting url: {} -> {}\n".format(
                    filename, url.sourceline, url.text, anth_url
                )
            )
            url.text = anth_url
        # Otherwise the existing <url> already equals the passing canonical
        # URL and is kept.  (Previously this case fell into the removal
        # branch and deleted a correct URL.)

    tree.write(outfilename, encoding="UTF-8", xml_declaration=True, with_tail=True)