-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
147 lines (130 loc) · 5.95 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import os
import time
import pytz
import shutil
import datetime
from typing import List, Dict
import urllib, urllib.request
import feedparser
from easydict import EasyDict
def remove_duplicated_spaces(text: str) -> str:
    """Collapse every run of whitespace in ``text`` into a single space.

    Splitting without arguments also discards leading and trailing
    whitespace, so the result never starts or ends with a space.
    """
    tokens = text.split()
    return " ".join(tokens)
def request_paper_with_arXiv_api(keyword: str, max_results: int, link: str = "OR") -> List[Dict[str, str]]:
    """Query the arXiv export API for papers matching ``keyword`` in title or abstract.

    Args:
        keyword: Search phrase. It is wrapped in double quotes before being
            placed in both the ``ti:`` and ``abs:`` clauses of the query.
        max_results: Value sent as the ``max_results`` query parameter.
        link: Boolean operator joining the title and abstract clauses;
            must be "OR" or "AND".

    Returns:
        One EasyDict per feed entry with the keys Title, Authors, Abstract,
        Link, Tags, Comment and Date. Authors and Tags are lists of strings;
        the other values are strings.

    Raises:
        AssertionError: If ``link`` is neither "OR" nor "AND".
    """
    assert link in ["OR", "AND"], "link should be 'OR' or 'AND'"
    quoted_keyword = "\"" + keyword + "\""
    url = "http://export.arxiv.org/api/query?search_query=(ti:{0}+{2}+abs:{0})&max_results={1}&sortBy=lastUpdatedDate".format(quoted_keyword, max_results, link)
    # Percent-encode only what is not URL syntax (e.g. spaces and the quotes
    # around the keyword); the characters listed in ``safe`` pass through.
    url = urllib.parse.quote(url, safe="%/:=&?~#+!$,;'@()*[]")
    # Use the response as a context manager so the connection is always closed.
    with urllib.request.urlopen(url) as response:
        payload = response.read().decode('utf-8')
    feed = feedparser.parse(payload)

    # NOTE default columns: Title, Authors, Abstract, Link, Tags, Comment, Date
    papers = []
    for entry in feed.entries:
        entry = EasyDict(entry)
        paper = EasyDict()
        # remove_duplicated_spaces splits on any whitespace, so embedded
        # newlines are normalised without a separate replace step.
        paper.Title = remove_duplicated_spaces(entry.title)
        paper.Abstract = remove_duplicated_spaces(entry.summary)
        # Entries without authors/tags yield empty lists instead of raising.
        paper.Authors = [remove_duplicated_spaces(author["name"]) for author in entry.get("authors", [])]
        paper.Link = remove_duplicated_spaces(entry.link)
        paper.Tags = [remove_duplicated_spaces(tag["term"]) for tag in entry.get("tags", [])]
        # The comment field is optional in the feed; default to an empty string.
        paper.Comment = remove_duplicated_spaces(entry.get("arxiv_comment", ""))
        paper.Date = entry.updated
        papers.append(paper)
    return papers
def filter_tags(papers: List[Dict[str, str]], target_fileds: List[str]=["cs"]) -> List[Dict[str, str]]:
    """Keep only the papers that carry at least one tag in a target field.

    A tag's field is the part before its first "." (for example "cs" for
    "cs.CL").

    Args:
        papers: Paper records; each must provide a "Tags" entry holding a
            list of tag strings.
        target_fileds: Field prefixes to keep. The default list is only
            read here, never mutated.

    Returns:
        The matching papers in their original order, each at most once.
    """
    # Item access (rather than attribute access) so plain dicts work as well
    # as EasyDict records.
    return [
        paper
        for paper in papers
        if any(tag.split(".")[0] in target_fileds for tag in paper["Tags"])
    ]
def get_daily_papers_by_keyword_with_retries(
    keyword: str,
    column_names: List[str],
    max_result: int,
    link: str = "OR",
    retries: int = 6,
    retry_interval: float = 30,
) -> List[Dict[str, str]]:
    """Fetch papers for ``keyword``, retrying while the result is empty.

    Args:
        keyword: Search phrase forwarded to get_daily_papers_by_keyword.
        column_names: Columns to keep in each returned paper.
        max_result: Maximum number of results to request.
        link: "OR" or "AND", forwarded unchanged.
        retries: Total number of attempts to make.
        retry_interval: Seconds to wait between two attempts.

    Returns:
        The first non-empty list of papers, or None when every attempt
        returned an empty list (or ``retries`` is not positive).
    """
    for attempt in range(retries):
        papers = get_daily_papers_by_keyword(keyword, column_names, max_result, link)
        if papers:
            return papers
        # Only announce and wait when another attempt will actually follow;
        # sleeping after the final failure would just delay the caller.
        if attempt < retries - 1:
            print("Unexpected empty list, retrying...")
            time.sleep(retry_interval)
    # failed
    return None
def get_daily_papers_by_keyword(keyword: str, column_names: List[str], max_result: int, link: str = "OR") -> List[Dict[str, str]]:
    """Fetch papers for ``keyword`` and reduce them to the requested columns.

    The papers come from request_paper_with_arXiv_api, are narrowed by
    filter_tags using its default target fields, and are then projected
    onto ``column_names``.

    Returns:
        A list of plain dicts, each holding exactly the keys in
        ``column_names``.
    """
    # NOTE fetched records carry: Title, Authors, Abstract, Link, Tags, Comment, Date
    fetched = request_paper_with_arXiv_api(keyword, max_result, link)
    # TODO filtering more
    kept = filter_tags(fetched)

    # Project every remaining record onto the columns chosen for display.
    selected = []
    for record in kept:
        row = {}
        for name in column_names:
            row[name] = record[name]
        selected.append(row)
    return selected
def _collapsible(summary: str, content: str) -> str:
    """Return an HTML <details> element that shows ``summary`` until expanded."""
    return "<details><summary>{0}</summary><p>{1}</p></details>".format(summary, content)


def generate_table(papers: List[Dict[str, str]], ignore_keys: List[str] = []) -> str:
    """Render ``papers`` as a Markdown table.

    Title and Date are always the first two columns: the title is a bold
    link built from "Title" and "Link", and the date keeps only the part of
    "Date" before "T". The remaining columns follow the key order of the
    first paper; only "Abstract", "Authors", "Tags" and "Comment" are
    rendered, and any other key is skipped.

    Args:
        papers: Paper records; each needs "Title", "Link" and "Date".
        ignore_keys: Optional columns to leave out. The default list is only
            read here, never mutated.

    Returns:
        The table as a string (header, separator, one row per paper), or an
        empty string when ``papers`` is empty.
    """
    if not papers:
        # Without a first paper there are no columns to derive.
        return ""

    keys = papers[0].keys()
    formatted_papers = []
    for paper in papers:
        # Fixed columns first so they lead the table.
        row = {
            "Title": "**[{0}]({1})**".format(paper["Title"], paper["Link"]),
            # 2021-08-01T00:00:00Z -> 2021-08-01
            "Date": paper["Date"].split("T")[0],
        }
        for key in keys:
            if key in ("Title", "Link", "Date") or key in ignore_keys:
                continue
            if key == "Abstract":
                # Hide the abstract behind a show/hide toggle.
                row[key] = _collapsible("Show", paper[key])
            elif key == "Authors":
                # NOTE only the first author is shown; an empty author list
                # yields an empty cell instead of raising IndexError.
                authors = paper[key]
                row[key] = authors[0] + " et al." if authors else ""
            elif key == "Tags":
                tags = ", ".join(paper[key])
                # Long tag lists are collapsed behind their first 5 characters.
                row[key] = _collapsible(tags[:5] + "...", tags) if len(tags) > 10 else tags
            elif key == "Comment":
                comment = paper[key]
                # Long comments are collapsed behind their first 5 characters.
                row[key] = _collapsible(comment[:5] + "...", comment) if len(comment) > 20 else comment
        formatted_papers.append(row)

    columns = list(formatted_papers[0].keys())
    lines = [
        # Header with bold column names, then the Markdown separator row.
        "| " + " | ".join("**" + column + "**" for column in columns) + " |",
        "| " + " | ".join(["---"] * len(columns)) + " |",
    ]
    lines.extend("| " + " | ".join(row.values()) + " |" for row in formatted_papers)
    return "\n".join(lines)
def back_up_files():
    """Move README.md and .github/ISSUE_TEMPLATE.md aside to ``.bk`` copies."""
    moves = (
        ("README.md", "README.md.bk"),
        (".github/ISSUE_TEMPLATE.md", ".github/ISSUE_TEMPLATE.md.bk"),
    )
    for source, backup in moves:
        shutil.move(source, backup)
def restore_files():
    """Move the ``.bk`` copies back over README.md and .github/ISSUE_TEMPLATE.md."""
    moves = (
        ("README.md.bk", "README.md"),
        (".github/ISSUE_TEMPLATE.md.bk", ".github/ISSUE_TEMPLATE.md"),
    )
    for backup, target in moves:
        shutil.move(backup, target)
def remove_backups():
    """Delete the ``.bk`` backups of README.md and .github/ISSUE_TEMPLATE.md."""
    for backup in ("README.md.bk", ".github/ISSUE_TEMPLATE.md.bk"):
        os.remove(backup)
def get_daily_date():
    """Return the current date in the Asia/Shanghai timezone.

    The format is "%B %d, %Y", e.g. "March 01, 2021"; note that ``%d``
    zero-pads the day of the month.
    """
    now_in_beijing = datetime.datetime.now(pytz.timezone('Asia/Shanghai'))
    return now_in_beijing.strftime("%B %d, %Y")