-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbase.py
227 lines (177 loc) · 8.14 KB
/
base.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
"""
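Common base class and static helpers for locating tab-separated
publication files, reading them into pandas data frames, grouping
publications by cluster, and extracting per-publication citation counts
recorded before and after a tool was added to a repository.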
"""
import os
import numpy as np
import pandas as pd
from numpy import average
# Suffix of a filename stem (the part before the extension) that marks a
# file as a "clustered" file; `Base.get_files` filters on it and
# `Base.get_repo_name` strips it. NOTE(review): the identifier is spelled
# "POSFIX" (sic); renaming it would break any code that imports it.
CLUSTERED_FILENAME_POSFIX = "_clustered"
# Header of the column that `Base.get_clusters` groups publications by.
CLUSTER_NAME_COLUMN_LABEL = "cluster_label"
# Headers of the columns that `Base.get_raw_citations` reads as the
# pre- and post- raw citation totals of each publication.
SUM_PRE_CITATIONS_COLUMN_LABEL = "SumPreRawCitations"
SUM_POST_CITATIONS_COLUMN_LABEL = "SumPostRawCitations"
class Base(object):
    """
    Base class containing common functionality to be used
    by the derived types.

    Apart from `run`, every method is a static helper that does not use
    instance state; they can be called either as `Base.helper(...)` or
    through an instance of a derived type.
    """

    def run(self, input_path):
        """
        Entry point to be implemented by the derived types.

        :type  input_path:  string
        :param input_path:  The input path given to the derived type.

        :raises NotImplementedError: Always; this base implementation
                                     is a placeholder.
        """
        raise NotImplementedError()

    @staticmethod
    def get_repo_name(filename):
        """
        Extracts repository name from the given filename.

        The name is the basename of the file without its extension and
        with every occurrence of `CLUSTERED_FILENAME_POSFIX` removed.

        :type  filename:    string
        :param filename:    The filename from which the repository
                            name should be extracted.

        :rtype:     string
        :return:    Repository name.
        """
        stem = os.path.splitext(os.path.basename(filename))[0]
        return stem.replace(CLUSTERED_FILENAME_POSFIX, "")

    @staticmethod
    def get_files(path, extension="csv", include_clustered_files=False):
        """
        Gets a list of paths to files with given `extension` found
        (recursively) under the given `path`.

        :type  path:    string
        :param path:    The path in which to search for the files.

        :type  extension:   string
        :param extension:   The extension of the files to search for,
                            with or without the leading dot
                            (e.g., `csv` or `.csv`).

        :type  include_clustered_files:     boolean
        :param include_clustered_files:
                    If set to True, only the files whose name (without
                    extension) ends with `CLUSTERED_FILENAME_POSFIX` are
                    returned; if set to False (default), only the files
                    whose name does NOT end with it are returned.

        :rtype:     list<string>
        :return:    Paths of the matching files. Each path is the walked
                    directory joined with the filename, hence it is
                    absolute only if `path` is absolute.
        """
        # Normalise so that both "csv" and ".csv" are accepted; an empty
        # extension matches files that have no extension.
        bare_extension = extension.lstrip(".")
        suffix = "." + bare_extension if bare_extension else ""
        want_clustered = bool(include_clustered_files)

        files = []
        for root, _, filenames in os.walk(path):
            for filename in filenames:
                stem, ext = os.path.splitext(filename)
                if ext != suffix:
                    continue
                is_clustered_file = stem.endswith(CLUSTERED_FILENAME_POSFIX)
                # Keep clustered files only when asked for, and
                # non-clustered files only otherwise.
                if is_clustered_file == want_clustered:
                    files.append(os.path.join(root, filename))
        return files

    @staticmethod
    def get_publications(filename):
        """
        Reads publications from file with the given filename.

        The file is read as tab-separated values whose first row
        contains the column headers.

        :type  filename:    string
        :param filename:    The name of the file from which publications
                            should be read.

        :rtype:     pandas.core.frame.DataFrame
        :return:    A dataframe that contains publications read from
                    the given file.
        """
        return pd.read_csv(filename, header=0, sep='\t')

    @staticmethod
    def get_clusters(filename):
        """
        Returns a data-frame grouped-by cluster name.

        :type  filename:    string
        :param filename:    Name of the file to be read.

        :rtype:     pandas.core.groupby.generic.DataFrameGroupBy
        :return:    The publications read from the file, grouped by the
                    column labeled `CLUSTER_NAME_COLUMN_LABEL`.
        """
        dataframe = Base.get_publications(filename)
        return dataframe.groupby(CLUSTER_NAME_COLUMN_LABEL)

    @staticmethod
    def get_citations_headers(publications):
        """
        Extracts the headers of columns containing citations of
        publications from the given data frame.

        This method assumes the consecutive columns with numerical
        headers (starting from the first numerical header to the next
        non-numerical header) contain the citation count of
        publications. The negative and non-negative numerical headers
        are assumed to contain citations belonging to before and after
        the tool was added to the repository, respectively.

        :type  publications:    pandas.core.frame.DataFrame
        :param publications:    The dataframe from which to extract
                                the citation column headers.

        :returns:
            - list pre:  The headers of columns containing citation
                         counts before the tool was added to the
                         repository.
            - list post: The headers of columns containing citation
                         counts after the tool was added to the
                         repository.
        """
        pre = []
        post = []
        in_numeric_run = False
        for header in publications.columns.values.tolist():
            try:
                value = float(header)
            except (TypeError, ValueError):
                # A non-numeric header either precedes the citation
                # columns (keep scanning) or terminates them (stop).
                if in_numeric_run:
                    break
                continue

            in_numeric_run = True
            if value < 0:
                pre.append(header)
            else:
                post.append(header)
        return pre, post

    @staticmethod
    def get_vectors(publications, citations_per_year=False):
        """
        Extracts the citation counts of every publication (row) in the
        given data frame, along with some per-publication statistics.

        The citation columns are determined using
        `Base.get_citations_headers`.

        :type  publications:    pandas.core.frame.DataFrame
        :param publications:    The dataframe from which to extract
                                citations vectors.

        :type  citations_per_year:  boolean
        :param citations_per_year:
                    If True, the delta of a publication is the absolute
                    difference between the averages of its post and pre
                    counts; otherwise (default) it is the absolute
                    difference between the maximum of its post counts
                    and the maximum of its pre counts (numpy raises
                    ValueError in this mode if a publication has no pre
                    or no post counts).

        :returns: Seven lists with one entry per publication, in order:
            - citations:      Pre counts followed by post counts.
            - pre_citations:  Pre counts.
            - post_citations: Post counts.
            - sums:           Sum of all (pre and post) counts.
            - avg_pre:        Average of pre counts.
            - avg_pst:        Average of post counts.
            - deltas:         See `citations_per_year`.
        """
        pre_headers, post_headers = Base.get_citations_headers(publications)

        citations = []
        sums = []
        deltas = []
        avg_pre = []
        avg_pst = []
        pre_citations = []
        post_citations = []

        for _, row in publications.iterrows():
            pre_vals = row.get(pre_headers).values.tolist()
            post_vals = row.get(post_headers).values.tolist()
            all_vals = pre_vals + post_vals

            pre_citations.append(pre_vals)
            post_citations.append(post_vals)
            citations.append(all_vals)
            sums.append(np.sum(all_vals))

            # Computed once and reused for the delta below.
            pre_mean = np.average(pre_vals)
            post_mean = np.average(post_vals)
            avg_pre.append(pre_mean)
            avg_pst.append(post_mean)

            if citations_per_year:
                deltas.append(abs(post_mean - pre_mean))
            else:
                deltas.append(abs(np.max(post_vals) - np.max(pre_vals)))

        return citations, pre_citations, post_citations, sums, avg_pre, avg_pst, deltas

    @staticmethod
    def get_raw_citations(publications):
        """
        Reads the pre and post raw citation totals of every publication
        (row) in the given data frame.

        :type  publications:    pandas.core.frame.DataFrame
        :param publications:    A dataframe with columns labeled
                                `SUM_PRE_CITATIONS_COLUMN_LABEL` and
                                `SUM_POST_CITATIONS_COLUMN_LABEL`.

        :returns: Three lists with one entry per publication, in order:
            - pre_citations:  Value of the pre column.
            - post_citations: Value of the post column.
            - deltas:         Post value minus pre value (signed).
        """
        deltas = []
        pre_citations = []
        post_citations = []
        for _, row in publications.iterrows():
            pre = row.get(SUM_PRE_CITATIONS_COLUMN_LABEL)
            post = row.get(SUM_POST_CITATIONS_COLUMN_LABEL)
            pre_citations.append(pre)
            post_citations.append(post)
            deltas.append(post - pre)
        return pre_citations, post_citations, deltas

    @staticmethod
    def get_sorted_clusters(publications):
        """
        Computes the average of all the citation counts of all the
        publications in every cluster.

        :type  publications:    pandas.core.groupby.generic.DataFrameGroupBy
        :param publications:    A dataframe grouped by clusters.

        :returns: In this order:
            - dictionary cluster_avg_mapping:
                        A dictionary where keys are the cluster names
                        and values are the average of citation count of
                        publications in that cluster.

            - dictionary avg_cluster_mapping:
                        A dictionary where keys are the average of
                        citation count of publications in a cluster
                        which is given by the value of that entry.
                        If several clusters have the same average, only
                        the one visited last is kept.

            - list<float> sorted_avg:
                        The per-cluster averages sorted in ascending
                        order.
        """
        cluster_avg_mapping = {}
        avg_cluster_mapping = {}
        for cluster in publications.groups:
            citations = Base.get_vectors(publications.get_group(cluster))[0]
            # Average over every count of every publication in the cluster.
            avg = np.average(citations)
            cluster_avg_mapping[cluster] = avg
            avg_cluster_mapping[avg] = cluster

        sorted_avg = sorted(cluster_avg_mapping.values())
        return cluster_avg_mapping, avg_cluster_mapping, sorted_avg