forked from academicpages/academicpages.github.io
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pubsFromBib.py
160 lines (122 loc) · 5.88 KB
/
pubsFromBib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env python
# coding: utf-8
# # Publications markdown generator for academicpages
#
# Takes a set of bibtex of publications and converts them for use with [academicpages.github.io](academicpages.github.io). This is an interactive Jupyter notebook ([see more info here](http://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/what_is_jupyter.html)).
#
# The core python code is also in `pubsFromBibs.py`.
# Run either from the `markdown_generator` folder after replacing updating the publist dictionary with:
# * bib file names
# * specific venue keys based on your bib file preferences
# * any specific pre-text for specific files
# * Collection Name (future feature)
#
# TODO: Make this work with other databases of citations,
# TODO: Merge this with the existing TSV parsing solution
from pybtex.database.input import bibtex
import pybtex.database.input.bibtex
from time import strptime
import string
import html
import os
import re
#todo: incorporate different collection types rather than a catch all publications, requires other changes to template
publist = {
"proceeding": {
"file" : "proceedings.bib",
"venuekey": "booktitle",
"venue-pretext": "In the proceedings of ",
"collection" : {"name":"publications",
"permalink":"/publication/"}
},
"journal":{
"file": "pubs.bib",
"venuekey" : "journal",
"venue-pretext" : "",
"collection" : {"name":"publications",
"permalink":"/publication/"}
}
}
html_escape_table = {
"&": "&",
'"': """,
"'": "'"
}
def html_escape(text):
"""Produce entities within text."""
return "".join(html_escape_table.get(c,c) for c in text)
for pubsource in publist:
parser = bibtex.Parser()
bibdata = parser.parse_file(publist[pubsource]["file"])
#loop through the individual references in a given bibtex file
for bib_id in bibdata.entries:
#reset default date
pub_year = "1900"
pub_month = "01"
pub_day = "01"
b = bibdata.entries[bib_id].fields
try:
pub_year = f'{b["year"]}'
#todo: this hack for month and day needs some cleanup
if "month" in b.keys():
if(len(b["month"])<3):
pub_month = "0"+b["month"]
pub_month = pub_month[-2:]
elif(b["month"] not in range(12)):
tmnth = strptime(b["month"][:3],'%b').tm_mon
pub_month = "{:02d}".format(tmnth)
else:
pub_month = str(b["month"])
if "day" in b.keys():
pub_day = str(b["day"])
pub_date = pub_year+"-"+pub_month+"-"+pub_day
#strip out {} as needed (some bibtex entries that maintain formatting)
clean_title = b["title"].replace("{", "").replace("}","").replace("\\","").replace(" ","-")
url_slug = re.sub("\\[.*\\]|[^a-zA-Z0-9_-]", "", clean_title)
url_slug = url_slug.replace("--","-")
md_filename = (str(pub_date) + "-" + url_slug + ".md").replace("--","-")
html_filename = (str(pub_date) + "-" + url_slug).replace("--","-")
#Build Citation from text
citation = ""
#citation authors - todo - add highlighting for primary author?
for author in bibdata.entries[bib_id].persons["author"]:
citation = citation+" "+author.first_names[0]+" "+author.last_names[0]+", "
#citation title
citation = citation + "\"" + html_escape(b["title"].replace("{", "").replace("}","").replace("\\","")) + ".\""
#add venue logic depending on citation type
venue = publist[pubsource]["venue-pretext"]+b[publist[pubsource]["venuekey"]].replace("{", "").replace("}","").replace("\\","")
citation = citation + " " + html_escape(venue)
citation = citation + ", " + pub_year + "."
## YAML variables
md = "---\ntitle: \"" + html_escape(b["title"].replace("{", "").replace("}","").replace("\\","")) + '"\n'
md += """collection: """ + publist[pubsource]["collection"]["name"]
md += """\npermalink: """ + publist[pubsource]["collection"]["permalink"] + html_filename
note = False
if "note" in b.keys():
if len(str(b["note"])) > 5:
md += "\nexcerpt: '" + html_escape(b["note"]) + "'"
note = True
md += "\ndate: " + str(pub_date)
md += "\nvenue: '" + html_escape(venue) + "'"
url = False
if "url" in b.keys():
if len(str(b["url"])) > 5:
md += "\npaperurl: '" + b["url"] + "'"
url = True
md += "\ncitation: '" + html_escape(citation) + "'"
md += "\n---"
## Markdown description for individual page
if note:
md += "\n" + html_escape(b["note"]) + "\n"
if url:
md += "\n[Access paper here](" + b["url"] + "){:target=\"_blank\"}\n"
else:
md += "\nUse [Google Scholar](https://scholar.google.com/scholar?q="+html.escape(clean_title.replace("-","+"))+"){:target=\"_blank\"} for full citation"
md_filename = os.path.basename(md_filename)
with open("../_publications/" + md_filename, 'w') as f:
f.write(md)
print(f'SUCESSFULLY PARSED {bib_id}: \"', b["title"][:60],"..."*(len(b['title'])>60),"\"")
# field may not exist for a reference
except KeyError as e:
print(f'WARNING Missing Expected Field {e} from entry {bib_id}: \"', b["title"][:30],"..."*(len(b['title'])>30),"\"")
continue