forked from millerlp/Misc_R_scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathauthor_year_plot.R
212 lines (196 loc) · 8.86 KB
/
author_year_plot.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
# author_year_plot.R
#
# Author: Luke Miller 2015-04-22
###############################################################################
###############################################################################
# Export a text file from Endnote that only lists Year and Authors, all
# separated by commas. To do this, create an Output Style
# that lists the year followed by a comma and then each author separated by
# a comma. Select all references, then go to File>Export. In the window that
# opens, you'll see a menu for output style; choose your author-only version
# there and save the output file as a text file.
# Name of the exported text file. Each line is expected to hold one record:
# the year, a comma, then the authors separated by commas.
f1 <- 'authors_list_20150422.txt'

## Scan input file, divide each line into a separate entry in a character vector
authors <- scan(file = f1, what = character(), sep = '\n')

# Extract the year from each record: the text running from the first '1' or
# '2' on the line up to (but not including) the first comma. regexpr() and
# substr() are vectorized, so the whole vector is handled in one call and an
# empty input simply yields an empty result.
yr <- substr(authors, regexpr('[1-2]', authors), regexpr(',', authors) - 1)
# Convert to numbers. Entries with missing or ambiguous years (anything with
# multiple years listed like 1997-2013) will end up as NA's in the yr vector,
# and will generate a warning.
yr <- as.numeric(yr)

# To count the number of authors on a paper, simply count the number of
# commas in each line of the authors vector. There is always one comma after
# the year, denoting at least one author, and every additional comma means
# there is another author. gregexpr() reports "no match" as a single -1, so
# only positive match positions are counted; a line with no comma at all
# therefore gets 0 (such a line also has no parsable year and is dropped below).
cnt <- vapply(gregexpr(',', authors, fixed = TRUE),
              function(hits) sum(hits > 0),
              numeric(1))

# Pick out rows that don't have a useful year value
bad.entries <- which(is.na(yr))
# Remove the offending rows from the yr and cnt vectors. A logical mask is
# used instead of yr[-(bad.entries)]: when there are no bad entries, negative
# indexing with an empty vector would discard every record rather than none.
has.year <- !is.na(yr)
yr <- yr[has.year]
cnt <- cnt[has.year]

# Make a data frame out of the yr and cnt vectors
df <- data.frame(Year = yr, Count = cnt)
# Make a new dataframe that holds each combination of Year and Count
newdf <- expand.grid(Years = unique(yr), Count = unique(cnt))
# Tally the number of papers in 'df' that match each Year & Count combination
# and store it in the TotalPapers column. The year column of newdf is called
# 'Years', so it is referenced by its full name rather than relying on the
# partial matching of newdf$Year.
newdf$TotalPapers <- vapply(seq_len(nrow(newdf)), function(i) {
  sum(df$Year == newdf$Years[i] & df$Count == newdf$Count[i])
}, integer(1))
# Drop any combinations where the TotalPapers was 0. Logical subsetting keeps
# all rows when no combination is empty (negative indexing with an empty
# which() result would have removed every row).
newdf <- newdf[newdf$TotalPapers > 0, ]
#########################################################
#########################################################
# Create a function to plot a color scale bar on the existing plot using the
# vector of colors that will be generated later by the colorRampPalette function
color.bar <- function(lut, min, max=-min, nticks=11,
                      x1 = 1, x2 = 2, y1 = 1, y2 = 2,
                      ticks=seq(min,max, length.out=nticks), round = TRUE,
                      title = '', cex.title = 1, text.col = 'black',
                      horiz = FALSE){
  # Draw a color scale bar on the currently open plot.
  #
  # lut = a vector of color values, in hex format
  # min = minimum value represented by the first color
  # max = maximum value represented by the last color
  # nticks = number of tick marks on the colorbar
  # x1 = location of left edge of colorbar, in plot's x-units
  # x2 = location of right edge of colorbar, in plot's x-units
  # y1 = location of bottom edge of color bar, in plot's y-units
  # y2 = location of top edge of color bar, in plot's y-units
  # ticks = a sequence of tick mark values to be added to colorbar
  # round = TRUE or FALSE, round off tick values to 0 decimal place.
  # title = Title for colorbar
  # cex.title = size for title
  # text.col = color of tick marks, title, and border of colorbar
  # horiz = TRUE or FALSE, lay out color bar horizontally or vertically
  #
  # Returns (invisibly) the plot coordinates at which the tick marks were
  # drawn along the long axis of the bar.

  n.cols <- length(lut)
  if (n.cols < 1) {
    stop("'lut' must contain at least one color", call. = FALSE)
  }
  horiz <- isTRUE(horiz)

  # Round off the tick marks if desired. The rounded values are used both as
  # labels and for positioning, so each label sits at the value it shows.
  if (round) { ticks <- round(ticks, 0) }

  # Ends of the bar along its long axis: x for a horizontal bar, y otherwise
  if (horiz) {
    bar.lo <- x1
    bar.hi <- x2
  } else {
    bar.lo <- y1
    bar.hi <- y2
  }

  # Draw one thin rectangle per color. The bar is divided into length(lut)
  # equal bands so that every color, including the last one, is shown and a
  # single-color lut does not cause a division by zero. rect() is vectorized,
  # so all bands are drawn in one call.
  band <- (bar.hi - bar.lo) / n.cols
  band.start <- bar.lo + (seq_len(n.cols) - 1) * band
  if (horiz) {
    rect(band.start, y1, band.start + band, y2, col = lut, border = NA)
  } else {
    rect(x1, band.start, x2, band.start + band, col = lut, border = NA)
  }
  # Draw a border around the color bar
  rect(x1, y1, x2, y2, col = NA, border = text.col)

  # Map tick values linearly from [min, max] onto the length of the bar, so
  # that min falls on the first edge and max on the last edge whatever the
  # value of min or the number of colors. If min equals max there is no range
  # to map, so ticks are placed at the middle of the bar.
  value.span <- max - min
  if (value.span == 0) {
    tick.frac <- rep(0.5, length(ticks))
  } else {
    tick.frac <- (ticks - min) / value.span
  }
  tick.pos <- bar.lo + tick.frac * (bar.hi - bar.lo)

  # Draw tick marks and tick labels
  if (length(ticks) > 0) {
    if (horiz) {
      # Set the tick marks and labels just below the bottom of the color bar
      # without having them take up too much of the plot area. The y
      # locations are calculated as y1 minus a fraction of the bar's height.
      tick.end <- y1 - ((y2 - y1) * 0.1)
      label.at <- y1 - ((y2 - y1) * 0.13)
      segments(tick.pos, y1, tick.pos, tick.end, col = text.col)
      text(x = tick.pos, y = label.at, labels = ticks, adj = c(0.5, 1),
           col = text.col)
    } else {
      # Set the tick marks and labels just off to the right side of the color
      # bar. The x locations are calculated as x2 plus a fraction of the
      # bar's width.
      tick.end <- x2 + ((x2 - x1) * 0.1)
      label.at <- x2 + ((x2 - x1) * 0.13)
      segments(x2, tick.pos, tick.end, tick.pos, col = text.col)
      text(x = label.at, y = tick.pos, labels = ticks, adj = c(0, 0.3),
           col = text.col)
    }
  }

  # Draw a title for the color bar, centered just above its top edge
  text(x = ((x1 + x2) / 2), y = y2, labels = title, adj = c(0.5, -0.35),
       cex = cex.title, col = text.col)

  invisible(tick.pos)
}
####################################################
####################################################
# Build a palette function that interpolates (in Lab color space) between the
# nine hex values of the ColorBrewer 9-class single-hue Blues scheme. The
# values are reversed before interpolation, so the first color produced is the
# darkest blue and the last is the lightest.
colfun <- colorRampPalette(
  rev(c("#f7fbff", "#deebf7", "#c6dbef", "#9ecae1", "#6baed6", "#4292c6",
        "#2171b5", "#08519c", "#08306b")),
  space = 'Lab'
)
# Generate one color per possible paper count, from 1 up to the largest
# TotalPapers value found in newdf
cols <- colfun(max(newdf$TotalPapers))
# Look up each row's color by using its TotalPapers value as an index into
# cols, and store the result in a new 'col' column
newdf$col <- cols[newdf$TotalPapers]
##############################
# Create an output file in svg format
svg(filename = "author-year-count.svg", width = 9, height = 4.8)
# Adjust the figure margins (bottom, left, top, right)
par(mar = c(5, 6, 1, 2))

# Author counts at which horizontal grid lines and y-axis ticks are placed
y.marks <- c(1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 35, 40)
grid.col <- "#CCCCCC"

# Set up an empty plot (type = 'n') with the y-axis suppressed; the points and
# the y-axis are added further down.
plot(Count ~ Years, data = newdf,
     type = 'n',
     yaxt = 'n',
     las = 1,
     ylim = c(0, 45),
     xlab = 'Publication Year',
     ylab = 'Number of coauthors',
     cex.axis = 1.3,
     cex.lab = 1.6)

# Fill the plot region with a background color. The region's extent is read
# from par("usr"), which holds the x-min, x-max, y-min, y-max of the plot.
usr <- par("usr")
rect(usr[1], usr[3], usr[2], usr[4], col = "#BBBBBB")

# Grid lines: horizontal at the chosen author counts, vertical every 5 years
abline(h = y.marks, col = grid.col)
abline(v = seq(1875, 2015, by = 5), col = grid.col)
# Redraw the bounding box, which the background and grid lines painted over
box()

# Plot one point per Year/Count combination, colored by its number of papers
points(Count ~ Years, data = newdf, col = newdf$col, pch = 20, cex = 0.9)

# Add a horizontal color scale near the top left of the plot region
color.bar(lut = cols,
          min = 1, max = max(newdf$TotalPapers),
          nticks = 8,
          x1 = 1880, x2 = 1920, y1 = 42, y2 = 44,
          horiz = TRUE,
          title = 'Number of papers', cex.title = 1.1, text.col = 'black')

# Draw the y-axis, labelling only some of the ticks to avoid crowding
axis(2, at = y.marks,
     labels = c('1', '', '3', '', '5', '10', '15', '20', '25', '30', '35', '40'),
     las = 1, cex.axis = 1.1)
dev.off()