Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
pablobarbera committed Dec 9, 2015
0 parents commit 4d563ac
Show file tree
Hide file tree
Showing 13 changed files with 383 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
^.*\.Rproj$
^\.Rproj\.user$
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.Rhistory
.Rproj*
test/*
*.Rproj
13 changes: 13 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Package: scholarnetwork
Type: Package
Title: Extract and Visualize Google Scholar Collaboration Networks
Version: 0.1
Date: 2015-10-11
Author: Pablo Barbera <pablo.barbera@nyu.edu>
Maintainer: Pablo Barbera <pablo.barbera@nyu.edu>
Description: Extracts publication information from Google Scholar, creates a
    network of collaborators based on co-authored projects, and visualizes the
    network using a force-directed layout algorithm.
License: GPL-2
LazyData: TRUE
Depends: igraph, scholar, stringr, networkD3
8 changes: 8 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Generated by roxygen2 (4.1.1): do not edit by hand

export(extractNetwork)
export(plotNetwork)
import(igraph)
import(networkD3)
import(scholar)
importFrom(stringr,str_trim)
68 changes: 68 additions & 0 deletions R/extract-network.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#' extractNetwork
#' @export
#'
#' @title
#' Extract collaborators network from Google Scholar page
#'
#' @description
#' Uses the \code{scholar} package to scrape the Google Scholar page of an
#' author (determined by ID) and returns a list with a data frame of weighted
#' edges and a data frame with node-level information
#'
#' @param id Character string specifying the Google Scholar ID.
#' @param n Maximum number of publications to retrieve.
#' @param largest_component If \code{TRUE}, keep only largest component in network
#' @param ... Other options to pass to \code{get_publications} function
#'
#' @examples \dontrun{
#' ## Download Google Scholar network data for a sample user
#' d <- extractNetwork(id="jGLKJUoAAAAJ", n=500)
#' ## Plot network into file called network.html
#' plotNetwork(d$nodes, d$edges, file="network.html")
#' }
#'

extractNetwork <- function(id, n=500, largest_component=TRUE, ...){

  # Download the publication list of the profile; `n` is forwarded as the
  # `pagesize` argument and `...` is passed through unchanged.
  pubs <- scholar::get_publications(id=id, pagesize=n, ...)

  # One edge list per publication (NULL when there are fewer than two
  # authors), stacked into a single data frame of co-author pairs.
  edges <- do.call(rbind, lapply(pubs$author, extractAuthors))
  if (is.null(edges) || nrow(edges) == 0) {
    stop("No co-authored publications found for id '", id, "'.",
         call. = FALSE)
  }

  # The network is undirected, so put every pair in a consistent order;
  # otherwise "A-B" from one paper and "B-A" from another would be summed
  # separately and show up as two parallel edges.
  first <- pmin(edges$node1, edges$node2)
  second <- pmax(edges$node1, edges$node2)
  edges$node1 <- first
  edges$node2 <- second

  # Sum the per-publication weights of each pair of co-authors.
  edges <- aggregate(edges$weight,
                     by=list(node1=edges$node1, node2=edges$node2),
                     FUN=sum)
  names(edges)[3] <- "weight"

  # Build the weighted, undirected graph; edges are created in row order, so
  # the weight vector lines up with them.
  network <- igraph::graph_from_edgelist(
    as.matrix(edges[, c("node1", "node2")]), directed=FALSE)
  igraph::edge_attr(network, "weight") <- edges$weight

  # Optionally keep only the largest connected component, in both the graph
  # and the returned edge list so that the two stay consistent.
  if (isTRUE(largest_component)) {
    comp <- igraph::components(network)
    in_largest <- comp$membership == which.max(comp$csize)
    keep <- igraph::V(network)$name[in_largest]
    edges <- edges[edges$node1 %in% keep & edges$node2 %in% keep, ,
                   drop=FALSE]
    rownames(edges) <- NULL
    network <- igraph::induced_subgraph(network, which(in_largest))
  }

  # Node-level information: weighted degree (strength) and the community
  # assigned by the walktrap algorithm.
  fc <- igraph::cluster_walktrap(network)
  nodes <- data.frame(label = igraph::V(network)$name,
                      degree = igraph::strength(network),
                      group = fc$membership,
                      stringsAsFactors = FALSE)
  nodes <- nodes[order(nodes$label), ]

  list(nodes=nodes, edges=edges)

}

# Internal helper: turn the comma-separated author string of one publication
# into an edge list of all co-author pairs.
#
# x: a single author string, e.g. "A Smith, B Jones, C Lee".
#
# Returns a data frame with columns node1, node2 (author names) and weight
# (1 / number of authors kept), or NULL when fewer than two authors remain
# after cleaning.
extractAuthors <- function(x){
  # as.character() keeps the split independent of whether the element is
  # passed as a character string or as a factor.
  authors <- unlist(stringr::str_split(as.character(x), ","))
  # drop entries that contain no letter at all (empty pieces, "..."); \p{L}
  # matches letters of any script, so non-Latin names are not discarded
  authors <- authors[grepl("\\p{L}", authors, perl = TRUE)]
  # cleaning author list
  authors <- stringr::str_trim(authors)
  # a single (or no) author yields no collaboration edges
  if (length(authors) <= 1) {
    return(NULL)
  }
  # every unordered pair of authors becomes one edge
  edges <- as.data.frame(t(utils::combn(x = authors, m = 2)),
                         stringsAsFactors = FALSE)
  names(edges) <- c("node1", "node2")
  edges$weight <- 1 / length(authors)
  edges
}

52 changes: 52 additions & 0 deletions R/plot-network.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#' plotNetwork
#' @export
#'
#' @title
#' Plot collaborators network from Google Scholar page
#'
#' @description
#' Takes value from \code{extractNetwork} function and visualizes network
#' using networkD3.
#'
#' @param nodes Data frame with node information returned by \code{extractNetwork}.
#' @param edges Data frame with edge list returned by \code{extractNetwork}.
#' @param file File where network visualization will be exported to.
#' @param width numeric width for the network graph's frame area in pixels
#' @param height numeric height for the network graph's frame area in pixels.
#' @param opacity numeric value of the proportion opaque you would like the graph elements to be.
#' @param fontsize numeric font size in pixels for the node text labels.
#' @param charge numeric value indicating either the strength of the node repulsion (negative value) or attraction (positive value).
#' @param ... Other options to pass to \code{networkD3} function
#'
#' @examples \dontrun{
#' ## Download Google Scholar network data for a sample user
#' d <- extractNetwork(id="jGLKJUoAAAAJ", n=500)
#' ## Plot network into file called network.html
#' plotNetwork(d$nodes, d$edges, file="network.html")
#' }
#'

plotNetwork <- function(nodes, edges, file='network.html', width=550,
                        height=400, opacity = .75, fontsize=10,
                        charge=-400,...){

  # Translate the endpoint names of every edge into the zero-based position
  # of that name in `nodes$label` (hence the "- 1").
  df <- data.frame(
    Source = match(edges$node1, nodes$label) - 1,
    Target = match(edges$node2, nodes$label) - 1,
    value = edges$weight)

  # An endpoint that is missing from the node list would become an NA index;
  # fail early with a clear message instead of handing it to the widget.
  if (anyNA(df$Source) || anyNA(df$Target)) {
    stop("All values in edges$node1 and edges$node2 must appear in ",
         "nodes$label.", call. = FALSE)
  }

  # Build the force-directed widget: nodes are labelled by `label`, coloured
  # by `group` and sized by `degree`; links are drawn with constant width.
  output <- networkD3::forceNetwork(Links = df, Nodes = nodes,
                                    Source = "Source", Target = "Target",
                                    NodeID = "label", Group = "group",
                                    linkWidth = 1, Nodesize = "degree",
                                    fontSize = fontsize, opacity = opacity,
                                    charge = charge,
                                    width = width, height = height, ...)

  # Write the widget to `file` (not self-contained).
  networkD3::saveNetwork(output, file, selfcontained = FALSE)

}

#d3Network::d3ForceNetwork(
# Links = df, Nodes = nodes, Source="Source", Target="Target",
# NodeID = "label", Group="group", width = width, height = height,
# opacity = opacity, file=file, fontsize=fontsize,
# linkDistance=linkDistance, ...)
14 changes: 14 additions & 0 deletions R/scholarnetwork-package.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#' Extract and Visualize Google Scholar Collaboration Networks
#'
#' This package provides functions to extract publication information from
#' Google Scholar, create a network of collaborators based on co-authored
#' projects, and visualize these networks using a force-directed layout algorithm.
#'
#' @seealso \code{\link{extractNetwork}}, \code{\link{plotNetwork}}
#' @name scholarnetwork-package
#' @aliases scholarnetwork
#' @docType package
#' @author Pablo Barbera \email{pablo.barbera@@nyu.edu}
#' @import scholar igraph networkD3
#' @importFrom stringr str_trim
NULL
123 changes: 123 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
<script src="https://raw.githubusercontent.com/ramnathv/htmlwidgets/master/inst/www/htmlwidgets.js"></script>
<script src="https://raw.githubusercontent.com/mbostock/d3/master/d3.min.js" charset="utf-8"></script>
<script src="https://raw.githubusercontent.com/christophergandrud/networkD3/master/inst/htmlwidgets/forceNetwork.js"></script>


# Extract and Visualize Google Scholar Collaboration Networks

**scholarnetwork** is an R package that provides functions to extract publication information from Google Scholar, create a network of collaborators based on co-authored projects, and visualize these networks using a force-directed layout algorithm.

## Installation ##

An initial release of this package is available in this repository (eventually maybe also on CRAN), and can be installed directly using Hadley Wickham's [devtools](http://cran.r-project.org/web/packages/devtools/index.html) package:

```
# install devtools only if it is not available yet, then install the package
if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools")
devtools::install_github("pablobarbera/scholarnetwork")
```

## Examples ##

For now, the package consists of two functions, `extractNetwork` and `plotNetwork`, which correspond to the data collection and data visualization steps.

`extractNetwork` wraps the `get_publications` function from the scholar package, which extracts the list of publications on a Google Scholar profile, cleans it, and then parses the results into a format that is more suitable for network analysis:

- a data frame of __weighted edges__, where each edge is a collaboration in a publication, and the weight is one divided by number of co-authors; and

- a data frame with __node-level information__, which includes the group resulting from running a walktrap community detection algorithm.

```r
d <- extractNetwork(id="jGLKJUoAAAAJ", n=500)
str(d)
```
```
List of 2
$ nodes:'data.frame': 40 obs. of 3 variables:
..$ label : chr [1:40] "A Boydstun" "A Valeriani" "A Venetz" "C Roca Cuberes" ...
..$ degree: num [1:40] 0.75 1.69 0.75 0.667 1.69 ...
..$ group : num [1:40] 11 7 10 8 7 1 3 4 5 13 ...
$ edges:'data.frame': 106 obs. of 3 variables:
..$ node1 : chr [1:106] "P Barberá" "C Vaccari" "K Ackermann" "P Barberá" ...
..$ node2 : chr [1:106] "A Boydstun" "A Valeriani" "A Venetz" "A Venetz" ...
..$ weight: num [1:106] 0.25 0.31 0.25 0.25 0.25 ...
```

`plotNetwork` takes the data frames of nodes and edges returned by `extractNetwork` and visualizes them using `networkD3`. The output of this function is an html file with the network visualization. Note that this function will also work with any other set of edge and node lists.

```r
plotNetwork(d$nodes, d$edges, file="network.html")
```
<div id="htmlwidget_container">
<div id="htmlwidget-491" style="width:550px;height:400px;" class="forceNetwork"></div>
</div>
<script type="application/json" data-for="htmlwidget-491">{"x":{"links":{"source":[26,4,21,26,31,11,16,22,25,23,26,26,29,26,26,36,26,5,6,22,23,26,32,0,1,4,5,13,20,22,24,26,27,32,35,33,1,4,6,11,13,14,20,23,24,25,26,27,32,26,30,34,14,20,26,17,26,30,34,1,4,24,26,27,32,26,31,26,26,11,1,3,4,5,6,22,23,28,29,30,31,38,26,32,3,7,26,1,4,5,14,18,20,22,24,26,26,30,0,26,26,38,39,26,26,38],"target":[0,1,2,2,2,5,5,5,5,6,7,8,8,9,10,10,12,13,13,13,13,13,13,14,14,14,14,14,14,14,14,14,14,14,14,15,16,16,16,16,16,16,16,16,16,16,16,16,16,17,17,17,18,18,18,19,19,19,19,20,20,20,20,20,20,21,21,22,24,25,26,26,26,26,26,26,26,26,26,26,26,26,27,27,28,30,30,32,32,32,32,32,32,32,32,32,34,34,35,35,36,36,36,37,39,39]},"nodes":{"name":["A Boydstun","A Valeriani","A Venetz","C Roca Cuberes","C Vaccari","D Penfold-Brown","E Borra","E Dinas","G Rivero","G Sood","H Schmitt","I Cioroianu","J Arregui","J Jost","J Nagler","J Subirats","J Tucker","JA Mayoral","JA Tucker","JR Montero","JT Jost","K Ackermann","M Metzger","N Hassanpour","N Wang","NYU Jonathan Nagler","P Barberá","P Egan","P Estelrich Arce","P Fernández-Vázquez","P Riera","PC Bauer","R Bonneau","R Gallego","R Gómez","S Linn","SA Popa","T Zeitzoff","Y Theocharis","Z 
Fazekas"],"group":[11,7,10,8,7,1,3,4,5,13,2,1,14,3,7,12,3,9,7,9,7,10,6,3,7,1,6,7,8,5,4,10,7,12,9,11,2,15,2,2],"nodesize":[0.75,1.69047619047619,0.75,0.666666666666667,1.69047619047619,1.58333333333333,0.8,0.666666666666667,1.66666666666667,0.5,0.666666666666667,0.75,0.5,1.63333333333333,5.74047619047619,0.5,4.07380952380952,0.8,0.8,0.8,4.15714285714286,0.75,2.33333333333333,0.8,0.833333333333333,0.75,15.5071428571429,0.833333333333333,0.666666666666667,0.666666666666667,1.46666666666667,0.75,4.99047619047619,0.5,0.8,0.75,1.41666666666667,0.5,0.75,0.75]},"options":{"NodeID":"label","Group":"group","colourScale":"d3.scale.category20()","fontSize":10,"fontFamily":"serif","clickTextSize":25,"linkDistance":50,"linkWidth":"1","charge":-500,"linkColour":"#666","opacity":0.75,"zoom":false,"legend":false,"nodesize":true,"radiusCalculation":" Math.sqrt(d.nodesize)+6","bounded":false,"opacityNoHover":0,"clickAction":null}},"evals":[]}</script>
<script type="application/htmlwidget-sizing" data-for="htmlwidget-491">{"viewer":{"width":550,"height":400,"padding":10,"fill":false},"browser":{"width":550,"height":400,"padding":10,"fill":false}}</script>



The output of the `extractNetwork` function can also be used to generate a static version of this visualization with e.g. `ggplot2`:

```r
library(ggplot2)
library(igraph)
# cleaning network data
network <- graph_from_data_frame(d$edges, directed=FALSE)
set.seed(123)
l <- layout_with_fr(network, niter=1500) # layout
fc <- cluster_walktrap(network) # community detection

# node locations
nodes <- data.frame(l); names(nodes) <- c("x", "y")
nodes$cluster <- factor(fc$membership)
nodes$label <- fc$names
nodes$degree <- degree(network)

# edge locations: coordinates of both endpoints of every edge
edgelist <- as_edgelist(network, names=FALSE)
edges <- data.frame(nodes[edgelist[,1],c("x", "y")], nodes[edgelist[,2],c("x", "y")])
names(edges) <- c("x1", "y1", "x2", "y2")

# and now visualizing it...
p <- ggplot(nodes, aes(x=x, y=y, color=cluster, label=label, size=degree))
pq <- p + geom_text(color="black", aes(label=label, size=degree),
                    show.legend=FALSE) +
  # nodes (shape 21 is a filled circle, which adds a border to the points)
  geom_point(color="grey20", aes(fill=cluster),
             shape=21, show.legend=FALSE, alpha=1/2) +
  # edges (label=NA overrides the label aesthetic inherited from ggplot())
  geom_segment(
    aes(x=x1, y=y1, xend=x2, yend=y2, label=NA),
    data=edges, size=0.25, color="grey20", alpha=1/5) +
  scale_fill_discrete(labels=labels) +
  scale_size_continuous(range = c(5, 8)) +
  theme(
    panel.background = element_rect(fill = "white"),
    plot.background = element_rect(fill="white"),
    axis.line = element_blank(), axis.text = element_blank(),
    axis.ticks = element_blank(),
    axis.title = element_blank(), panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.background = element_rect(colour = FALSE, fill = "black"),
    legend.key = element_rect(fill = "black", colour = FALSE),
    legend.title = element_text(color="white"),
    legend.text = element_text(color="white")
  ) +
  ## changing size of points in legend
  guides(fill = guide_legend(override.aes = list(size=5)))

pq
```

<center><img src="img/network.png" style="width: 550px;"/></center>

However, it is difficult to make sure labels do not overlap. A probably better option is to export the network data to a format that Gephi can read, and then edit it manually in Gephi, as shown below.

```r
df <- data.frame(Source = d$edges$node1, Target = d$edges$node2)
write.csv(df, file="edgelist-gephi.csv", row.names=FALSE)
```

<center><img src="img/collaborator-network.png" style="width: 550px;"/></center>
Binary file added img/collaborator-network.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/network.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
34 changes: 34 additions & 0 deletions man/extractNetwork.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/extract-network.R
\name{extractNetwork}
\alias{extractNetwork}
\title{Extract collaborators network from Google Scholar page}
\usage{
extractNetwork(id, n = 500, largest_component = TRUE, ...)
}
\arguments{
\item{id}{Character string specifying the Google Scholar ID.}

\item{n}{Maximum number of publications to retrieve.}

\item{largest_component}{If \code{TRUE}, keep only largest component in network}

\item{...}{Other options to pass to \code{get_publications} function}
}
\description{
Uses \code{scholar} package to scrape Google Scholar page of an author
(determined by ID) and returns a list with a list of edges and a data frame
with node-level information
}
\details{
extractNetwork
}
\examples{
\dontrun{
## Download Google Scholar network data for a sample user
d <- extractNetwork(id="jGLKJUoAAAAJ", n=500)
## Plot network into file called \\code{network.html}
plotNetwork(d$nodes, d$edges, file="network.html")
}
}

46 changes: 46 additions & 0 deletions man/plotNetwork.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/plot-network.R
\name{plotNetwork}
\alias{plotNetwork}
\title{Plot collaborators network from Google Scholar page}
\usage{
plotNetwork(nodes, edges, file = "network.html", width = 550,
  height = 400, opacity = 0.75, fontsize = 10, charge = -400, ...)
}
\arguments{
\item{nodes}{Data frame with node information returned by \code{extractNetwork}.}

\item{edges}{Data frame with edge list returned by \code{extractNetwork}.}

\item{file}{File where network visualization will be exported to.}

\item{width}{numeric width for the network graph's frame area in pixels}
\item{height}{numeric height for the network graph's frame area in pixels.}

\item{opacity}{numeric value of the proportion opaque you would like the graph elements to be.}

\item{fontsize}{numeric font size in pixels for the node text labels.}

\item{charge}{numeric value indicating either the strength of the node repulsion (negative value) or attraction (positive value).}

\item{...}{Other options to pass to \code{networkD3} function}
}
\description{
Takes value from \code{extractNetwork} function and visualizes network
using networkD3.
}
\details{
plotNetwork
}
\examples{
\dontrun{
## Download Google Scholar network data for a sample user
d <- extractNetwork(id="jGLKJUoAAAAJ", n=500)
## Plot network into file called \\code{network.html}
plotNetwork(d$nodes, d$edges, file="network.html")
}
}

19 changes: 19 additions & 0 deletions man/scholarnetwork-package.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/scholarnetwork-package.R
\docType{package}
\name{scholarnetwork-package}
\alias{scholarnetwork}
\alias{scholarnetwork-package}
\title{Extract and Visualize Google Scholar Collaboration Networks}
\description{
This package provides functions to extract publication information from
Google Scholar, create a network of collaborators based on co-authored projects,
and visualize these networks using a force-directed layout algorithm.
}
\author{
Pablo Barbera \email{pablo.barbera@nyu.edu}
}
\seealso{
\code{\link{extractNetwork}}, \code{\link{plotNetwork}}
}

0 comments on commit 4d563ac

Please sign in to comment.