forked from pablobarbera/scholarnetwork
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 4d563ac
Showing
13 changed files
with
383 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
^.*\.Rproj$ | ||
^\.Rproj\.user$ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
.Rhistory | ||
.Rproj* | ||
test/* | ||
*.Rproj |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
Package: scholarnetwork | ||
Type: Package | ||
Title: Extract and Visualize Google Scholar Collaboration Networks | ||
Version: 0.1 | ||
Date: 2015-10-11 | ||
Author: Pablo Barbera <[email protected]> | ||
Maintainer: Pablo Barbera <[email protected]> | ||
Description: Extracts publication information from Google Scholar, create network of | ||
collaborators based on co-authored projects, and visualize network using | ||
a force-directed layout algorithm. | ||
License: GPL-2 | ||
LazyData: TRUE | ||
Depends: igraph, scholar, stringr, networkD3 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# Generated by roxygen2 (4.1.1): do not edit by hand | ||
|
||
export(extractNetwork) | ||
export(plotNetwork) | ||
import(igraph) | ||
import(networkD3) | ||
import(scholar) | ||
importFrom(stringr,str_trim) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
#' extractNetwork | ||
#' @export | ||
#' | ||
#' @title | ||
#' Extract collaborators network from Google Scholar page | ||
#' | ||
#' @description | ||
#' Uses \code{scholar} package to scrape Google Scholar page of an author | ||
#' (determined by ID) and returns a list with a list of edges and a data frame | ||
#' with node-level information | ||
#' | ||
#' @param id Character string specifying the Google Scholar ID. | ||
#' @param n Maximum number of publications to retrieve. | ||
#' @param largest_component If \code{TRUE}, keep only largest component in network | ||
#' @param ... Other options to pass to \code{get_publications} function | ||
#' | ||
#' @examples \dontrun{ | ||
#' ## Download Google Scholar network data for a sample user | ||
#' d <- extractNetwork(id="jGLKJUoAAAAJ", n=500) | ||
#' ## Plot network into file called \code{network.html} | ||
#' plotNetwork(d$nodes, d$edges, file="network.html") | ||
#' } | ||
#' | ||
|
||
extractNetwork <- function(id, n=500, largest_component=TRUE, ...){ | ||
|
||
# downloading publications | ||
pubs <- scholar::get_publications(id=id, pagesize=n, ...) | ||
|
||
# converting to edges | ||
edges <- lapply(pubs$author, extractAuthors) | ||
edges <- do.call(rbind, edges) | ||
edges <- aggregate(edges$weight, | ||
by=list(node1=edges$node1, node2=edges$node2), | ||
FUN=function(x) sum(x)) | ||
names(edges)[3] <- "weight" | ||
|
||
### SELECT LARGEST COMPONENT | ||
|
||
# extracting node-level information | ||
network <- igraph::graph.edgelist(as.matrix(edges[,c("node1", "node2")]), directed=FALSE) | ||
igraph::edge_attr(network, "weight") <- edges$weight | ||
fc <- igraph::walktrap.community(network) | ||
nodes <- data.frame(label = igraph::V(network)$name, | ||
degree=igraph::strength(network), group=fc$membership, | ||
stringsAsFactors=F) | ||
nodes <- nodes[order(nodes$label),] | ||
|
||
return(list(nodes=nodes, edges=edges)) | ||
|
||
} | ||
|
||
extractAuthors <- function(x){ | ||
authors <- unlist(stringr::str_split(x, ",")) | ||
# deleting empty authors | ||
authors <- authors[grepl('[A-Za-z]+', authors)] | ||
# cleaning author list | ||
authors <- stringr::str_trim(authors) | ||
# if more than one author, create edge list | ||
if (length(authors)>1){ | ||
edges <- as.data.frame(t(combn(x=authors, m=2)), stringsAsFactors=F) | ||
names(edges) <- c("node1", "node2") | ||
edges$weight <- 1/length(authors) | ||
return(edges) | ||
} | ||
if (length(authors)<=1) return(NULL) | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
#' plotNetwork | ||
#' @export | ||
#' | ||
#' @title | ||
#' Plot collaborators network from Google Scholar page | ||
#' | ||
#' @description | ||
#' Takes value from \code{extractNetwork} function and visualizes network | ||
#' using networkD3. | ||
#' | ||
#' @param nodes Data frame with node information returned by \code{extractNetwork}. | ||
#' @param edges Data frame with edge list returned by \code{extractNetwork}. | ||
#' @param file File where network visualization will be exported to. | ||
#' @param width numeric width for the network graph's frame area in pixels | ||
#' @param height numeric height for the network graph's frame area in pixels. | ||
#' @param opacity numeric value of the proportion opaque you would like the graph elements to be. | ||
#' @param fontsize numeric font size in pixels for the node text labels. | ||
#' @param charge numeric value indicating either the strength of the node repulsion (negative value) or attraction (positive value). | ||
#' @param ... Other options to pass to \code{networkD3} function | ||
#' | ||
#' #' @examples \dontrun{ | ||
#' ## Download Google Scholar network data for a sample user | ||
#' d <- extractNetwork(id="jGLKJUoAAAAJ", n=500) | ||
#' ## Plot network into file called \code{network.html} | ||
#' plotNetwork(d$nodes, d$edges, file="network.html") | ||
#' } | ||
#' | ||
|
||
plotNetwork <- function(nodes, edges, file='network.html', width=550, | ||
height=400, opacity = .75, fontsize=10, | ||
charge=-400,...){ | ||
|
||
df <- data.frame( | ||
Source=as.numeric(factor(edges$node1, levels=nodes$label))-1, | ||
Target=as.numeric(factor(edges$node2, levels=nodes$label))-1, | ||
value=edges$weight) | ||
|
||
output <- networkD3::forceNetwork(Links = df, Nodes = nodes, Source="Source", Target="Target", | ||
NodeID = "label", Group = "group",linkWidth = 1, | ||
Nodesize = "degree", fontSize=fontsize, | ||
opacity = opacity, charge=charge, | ||
width = width, height = height, ...) | ||
|
||
saveNetwork(output, file, selfcontained = FALSE) | ||
|
||
} | ||
|
||
#d3Network::d3ForceNetwork( | ||
# Links = df, Nodes = nodes, Source="Source", Target="Target", | ||
# NodeID = "label", Group="group", width = width, height = height, | ||
# opacity = opacity, file=file, fontsize=fontsize, | ||
# linkDistance=linkDistance, ...) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
#' Extract and Visualize Google Scholar Collaboration Networks | ||
#' | ||
#' This package provides functions to extracts publication information from | ||
#' Google Scholar, create network of collaborators based on co-authored projects, | ||
#' and visualize these networks using a force-directed layout algorithm. | ||
#' | ||
#' @seealso \code{\link{extractNetwork}}, \code{\link{plotNetwork}} | ||
#' @name scholarnetwork-package | ||
#' @aliases scholarnetwork | ||
#' @docType package | ||
#' @author Pablo Barbera \email{pablo.barbera@@nyu.edu} | ||
#' @import scholar igraph networkD3 | ||
#' @importFrom stringr str_trim | ||
NULL |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
<script src="https://raw.githubusercontent.com/ramnathv/htmlwidgets/master/inst/www/htmlwidgets.js"></script> | ||
<script src="https://raw.githubusercontent.com/mbostock/d3/master/d3.min.js" charset="utf-8"></script> | ||
<script src="https://raw.githubusercontent.com/christophergandrud/networkD3/master/inst/htmlwidgets/forceNetwork.js"></script> | ||
|
||
|
||
# Extract and Visualize Google Scholar Collaboration Networks | ||
|
||
**scholarnetwork** is an R package that provides functions to extracts publication information from Google Scholar, create network of collaborators based on co-authored projects, and visualize these networks using a force-directed layout algorithm. | ||
|
||
## Installation ## | ||
|
||
An initial release of this package is available in this repository (eventually maybe also on CRAN), and can be installed directly using Hadley Wickham's [devtools](http://cran.r-project.org/web/packages/devtools/index.html) package: | ||
|
||
``` | ||
if(!require("devtools")) install.packages("devtools") | ||
library("devtools") | ||
install_github("pablobarbera/scholarnetwork") | ||
``` | ||
|
||
## Examples ## | ||
|
||
For now, the package consists of two functions, `extractNetwork` and `plotNetwork`, which correspond to the data collection and data visualization steps. | ||
|
||
`extractNetwork` wraps the `get_publications` function from the scholar package, which extracts the list of publications on a Google Scholar profile, cleans it, and then parses the results into a format that is more suitable for network analysis: | ||
|
||
- a data frame of __weighted edges__, where each edge is a collaboration in a publication, and the weight is one divided by number of co-authors; and | ||
|
||
- a data frame with __node-level information__, which includes the group resulting from running a walktrap community detection algorithm. | ||
|
||
```r | ||
d <- extractNetwork(id="jGLKJUoAAAAJ", n=500) | ||
str(d) | ||
``` | ||
``` | ||
List of 2 | ||
$ nodes:'data.frame': 40 obs. of 3 variables: | ||
..$ label : chr [1:40] "A Boydstun" "A Valeriani" "A Venetz" "C Roca Cuberes" ... | ||
..$ degree: num [1:40] 0.75 1.69 0.75 0.667 1.69 ... | ||
..$ group : num [1:40] 11 7 10 8 7 1 3 4 5 13 ... | ||
$ edges:'data.frame': 106 obs. of 3 variables: | ||
..$ node1 : chr [1:106] "P Barberá" "C Vaccari" "K Ackermann" "P Barberá" ... | ||
..$ node2 : chr [1:106] "A Boydstun" "A Valeriani" "A Venetz" "A Venetz" ... | ||
..$ weight: num [1:106] 0.25 0.31 0.25 0.25 0.25 ... | ||
``` | ||
|
||
`plotNetwork` takes the lists of nodes and edges returned by `extractNetwork` and visualizes it using `networkD3`. The output of this function is an html file with the network visualization. Note that this function will also work with any other set of edge and node lists. | ||
|
||
```r | ||
plotNetwork(d$nodes, d$edges, file="network.html") | ||
``` | ||
<div id="htmlwidget_container"> | ||
<div id="htmlwidget-491" style="width:550px;height:400px;" class="forceNetwork"></div> | ||
</div> | ||
<script type="application/json" data-for="htmlwidget-491">{"x":{"links":{"source":[26,4,21,26,31,11,16,22,25,23,26,26,29,26,26,36,26,5,6,22,23,26,32,0,1,4,5,13,20,22,24,26,27,32,35,33,1,4,6,11,13,14,20,23,24,25,26,27,32,26,30,34,14,20,26,17,26,30,34,1,4,24,26,27,32,26,31,26,26,11,1,3,4,5,6,22,23,28,29,30,31,38,26,32,3,7,26,1,4,5,14,18,20,22,24,26,26,30,0,26,26,38,39,26,26,38],"target":[0,1,2,2,2,5,5,5,5,6,7,8,8,9,10,10,12,13,13,13,13,13,13,14,14,14,14,14,14,14,14,14,14,14,14,15,16,16,16,16,16,16,16,16,16,16,16,16,16,17,17,17,18,18,18,19,19,19,19,20,20,20,20,20,20,21,21,22,24,25,26,26,26,26,26,26,26,26,26,26,26,26,27,27,28,30,30,32,32,32,32,32,32,32,32,32,34,34,35,35,36,36,36,37,39,39]},"nodes":{"name":["A Boydstun","A Valeriani","A Venetz","C Roca Cuberes","C Vaccari","D Penfold-Brown","E Borra","E Dinas","G Rivero","G Sood","H Schmitt","I Cioroianu","J Arregui","J Jost","J Nagler","J Subirats","J Tucker","JA Mayoral","JA Tucker","JR Montero","JT Jost","K Ackermann","M Metzger","N Hassanpour","N Wang","NYU Jonathan Nagler","P Barberá","P Egan","P Estelrich Arce","P Fernández-Vázquez","P Riera","PC Bauer","R Bonneau","R Gallego","R Gómez","S Linn","SA Popa","T Zeitzoff","Y Theocharis","Z Fazekas"],"group":[11,7,10,8,7,1,3,4,5,13,2,1,14,3,7,12,3,9,7,9,7,10,6,3,7,1,6,7,8,5,4,10,7,12,9,11,2,15,2,2],"nodesize":[0.75,1.69047619047619,0.75,0.666666666666667,1.69047619047619,1.58333333333333,0.8,0.666666666666667,1.66666666666667,0.5,0.666666666666667,0.75,0.5,1.63333333333333,5.74047619047619,0.5,4.07380952380952,0.8,0.8,0.8,4.15714285714286,0.75,2.33333333333333,0.8,0.833333333333333,0.75,15.5071428571429,0.833333333333333,0.666666666666667,0.666666666666667,1.46666666666667,0.75,4.99047619047619,0.5,0.8,0.75,1.41666666666667,0.5,0.75,0.75]},"options":{"NodeID":"label","Group":"group","colourScale":"d3.scale.category20()","fontSize":10,"fontFamily":"serif","clickTextSize":25,"linkDistance":50,"linkWidth":"1","charge":-500,"linkColour":"#666","opacity":0.75,"zoom":false,"legend":false,"nodesize":true,"radiusCalculation":" Math.sqrt(d.nodesize)+6","bounded":false,"opacityNoHover":0,"clickAction":null}},"evals":[]}</script> | ||
<script type="application/htmlwidget-sizing" data-for="htmlwidget-491">{"viewer":{"width":550,"height":400,"padding":10,"fill":false},"browser":{"width":550,"height":400,"padding":10,"fill":false}}</script> | ||
|
||
|
||
|
||
The output of the `extractNetwork` function can also be used to generate a static version of this visualization with e.g. `ggplot2`: | ||
|
||
```r | ||
library(ggplot2) | ||
library(igraph) | ||
# cleaning network data | ||
network <- graph_from_data_frame(d$edges, directed=FALSE) | ||
set.seed(123) | ||
l <- layout.fruchterman.reingold(network, niter=1500) # layout | ||
fc <- walktrap.community(network) # community detection | ||
|
||
# node locations | ||
nodes <- data.frame(l); names(nodes) <- c("x", "y") | ||
nodes$cluster <- factor(fc$membership) | ||
nodes$label <- fc$names | ||
nodes$degree <- degree(network) | ||
|
||
# edge locations | ||
edgelist <- get.edgelist(network, names=FALSE) | ||
edges <- data.frame(nodes[edgelist[,1],c("x", "y")], nodes[edgelist[,2],c("x", "y")]) | ||
names(edges) <- c("x1", "y1", "x2", "y2") | ||
|
||
# and now visualizing it... | ||
p <- ggplot(nodes, aes(x=x, y=y, color=cluster, label=label, size=degree)) | ||
pq <- p + geom_text(color="black", aes(label=label, size=degree), | ||
show_guide=FALSE) + | ||
# nodes | ||
geom_point(color="grey20", aes(fill=cluster), | ||
shape=21, show_guide=FALSE, alpha=1/2) + | ||
# edges | ||
geom_segment( | ||
aes(x=x1, y=y1, xend=x2, yend=y2, label=NA), | ||
data=edges, size=0.25, color="grey20", alpha=1/5) + | ||
## note that here I add a border to the points | ||
scale_fill_discrete(labels=labels) + | ||
scale_size_continuous(range = c(5, 8)) + | ||
theme( | ||
panel.background = element_rect(fill = "white"), | ||
plot.background = element_rect(fill="white"), | ||
axis.line = element_blank(), axis.text = element_blank(), | ||
axis.ticks = element_blank(), | ||
axis.title = element_blank(), panel.border = element_blank(), | ||
panel.grid.major = element_blank(), | ||
panel.grid.minor = element_blank(), | ||
legend.background = element_rect(colour = F, fill = "black"), | ||
legend.key = element_rect(fill = "black", colour = F), | ||
legend.title = element_text(color="white"), | ||
legend.text = element_text(color="white") | ||
) + | ||
## changing size of points in legend | ||
guides(fill = guide_legend(override.aes = list(size=5))) | ||
|
||
pq | ||
``` | ||
|
||
<center><img src="img/network.png" style="width: 550px;"/></center> | ||
|
||
However, it is difficult to make sure labels do not overlap. A probably better option is to export the network data to a format that Gephi can read, and then edit it manually in Gephi, as shown below. | ||
|
||
```r | ||
df <- data.frame(Source = d$edges$node1, Target = d$edges$node2) | ||
write.csv(df, file="edgelist-gephi.csv", row.names=FALSE) | ||
``` | ||
|
||
<center><img src="img/collaborator-network.png" style="width: 550px;"/></center> |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
% Generated by roxygen2 (4.1.1): do not edit by hand | ||
% Please edit documentation in R/extract-network.R | ||
\name{extractNetwork} | ||
\alias{extractNetwork} | ||
\title{Extract collaborators network from Google Scholar page} | ||
\usage{ | ||
extractNetwork(id, n = 500, largest_component = TRUE, ...) | ||
} | ||
\arguments{ | ||
\item{id}{Character string specifying the Google Scholar ID.} | ||
|
||
\item{n}{Maximum number of publications to retrieve.} | ||
|
||
\item{largest_component}{If \code{TRUE}, keep only largest component in network} | ||
|
||
\item{...}{Other options to pass to \code{get_publications} function} | ||
} | ||
\description{ | ||
Uses \code{scholar} package to scrape Google Scholar page of an author | ||
(determined by ID) and returns a list with a list of edges and a data frame | ||
with node-level information | ||
} | ||
\details{ | ||
extractNetwork | ||
} | ||
\examples{ | ||
\dontrun{ | ||
## Download Google Scholar network data for a sample user | ||
d <- extractNetwork(id="jGLKJUoAAAAJ", n=500) | ||
## Plot network into file called \\code{network.html} | ||
plotNetwork(d$nodes, d$edges, file="network.html") | ||
} | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
% Generated by roxygen2 (4.1.1): do not edit by hand | ||
% Please edit documentation in R/plot-network.R | ||
\name{plotNetwork} | ||
\alias{plotNetwork} | ||
\title{Plot collaborators network from Google Scholar page} | ||
\usage{ | ||
plotNetwork(nodes, edges, file = "network.html", width = 550, | ||
height = 400, opacity = 0.75, fontsize = 10, charge = -500, ...) | ||
} | ||
\arguments{ | ||
\item{nodes}{Data frame with node information returned by \code{extractNetwork}.} | ||
|
||
\item{edges}{Data frame with edge list returned by \code{extractNetwork}.} | ||
|
||
\item{file}{File where network visualization will be exported to.} | ||
|
||
\item{width}{numeric width for the network graph's frame area in pixels} | ||
\item{height}{numeric height for the network graph's frame area in pixels.} | ||
|
||
\item{opacity}{numeric value of the proportion opaque you would like the graph elements to be.} | ||
|
||
\item{fontsize}{numeric font size in pixels for the node text labels.} | ||
|
||
\item{charge}{numeric value indicating either the strength of the node repulsion (negative value) or attraction (positive value).} | ||
|
||
\item{...}{Other options to pass to \code{networkD3} function | ||
|
||
#'} | ||
} | ||
\description{ | ||
Takes value from \code{extractNetwork} function and visualizes network | ||
using networkD3. | ||
} | ||
\details{ | ||
plotNetwork | ||
} | ||
\examples{ | ||
\dontrun{ | ||
## Download Google Scholar network data for a sample user | ||
d <- extractNetwork(id="jGLKJUoAAAAJ", n=500) | ||
## Plot network into file called \\code{network.html} | ||
plotNetwork(d$nodes, d$edges, file="network.html") | ||
} | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
% Generated by roxygen2 (4.1.1): do not edit by hand | ||
% Please edit documentation in R/scholarnetwork-package.R | ||
\docType{package} | ||
\name{scholarnetwork-package} | ||
\alias{scholarnetwork} | ||
\alias{scholarnetwork-package} | ||
\title{Extract and Visualize Google Scholar Collaboration Networks} | ||
\description{ | ||
This package provides functions to extracts publication information from | ||
Google Scholar, create network of collaborators based on co-authored projects, | ||
and visualize these networks using a force-directed layout algorithm. | ||
} | ||
\author{ | ||
Pablo Barbera \email{pablo.barbera@nyu.edu} | ||
} | ||
\seealso{ | ||
\code{\link{extractNetwork}}, \code{\link{plotNetwork}} | ||
} | ||
|