tidied data parsing and updated readme
Kearney committed Aug 30, 2017
1 parent 364b954 commit 7866832
Showing 10 changed files with 155 additions and 116 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
@@ -8,3 +8,4 @@ export(as.tibble.sentiment_analysis_list)
 export(as_tibble.sentiment_analysis)
 export(as_tibble.sentiment_analysis_list)
 export(googleapis_token)
+export(prep_text)
80 changes: 55 additions & 25 deletions R/analyze_sentiment.R
@@ -1,30 +1,45 @@

 #' analyze_sentiment
 #'
-#' Returns sentiment analysis from the Google Cloud Language API.
+#' Conducts and returns results of sentiment analysis using Google Cloud Natural
+#' Language API.
 #'
-#' @param data Vector of plain text to analyze.
+#' @param text Vector of plain text to analyze.
 #' @return List of parsed response objects.
 #' @export
 #' @aliases analyse_sentiment
-analyze_sentiment <- function(data) {
-  eval(call("analyze_sentiment_", data))
+analyze_sentiment <- function(text, id = NULL) {
+  eval(call("analyze_sentiment_", text, id))
 }
 
-analyze_sentiment_ <- function(text) {
-  analyze_sentiment_internal <- function(text) {
+analyze_sentiment_ <- function(text, id = NULL) {
+  analyze_sentiment_internal <- function(text, id = NULL) {
     ## API path
     path <- "analyzeSentiment"
     ## format text for request
-    text <- jsonify_text(text)
+    jstext <- jsonify_text(text)
     ## execute request
-    r <- httr::POST(api_call(path), body = text)
+    r <- httr::POST(api_call(path), body = jstext)
     ## parse
     r <- parse_docs(r)
-    class(r) <- c("sentiment_analysis", "list")
-    r
+    structure(
+      .Data = r,
+      class = c("sentiment_analysis", "list"),
+      text = text,
+      id = id
+    )
+    ##class(r) <- c("sentiment_analysis", "list")
+    ##r
   }
-  out <- lapply(text, analyze_sentiment_internal)
+  if (!is.null(id)) {
+    stopifnot(length(text) == length(id))
+    out <- Map(analyze_sentiment_internal, text, id)
+  } else {
+    out <- Map(analyze_sentiment_internal, text)
+    for (i in seq_along(out)) {
+      attr(out[[i]], "id") <- i
+    }
+  }
   class(out) <- c("sentiment_analysis_list", "list")
   out
 }
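Taken together, the reworked functions send one request per document and tag each response with its original text and an `id`. A minimal usage sketch (hypothetical input, not part of the commit; assumes a valid token has already been configured via `googleapis_token()`):

```r
## hypothetical documents to analyze
tweets <- c(
  "I love this package. It works great!",
  "This is terrible. Never again."
)
sa <- analyze_sentiment(tweets, id = c("tw1", "tw2"))
class(sa)             # "sentiment_analysis_list" "list"
attr(sa[[1]], "id")   # "tw1"
attr(sa[[1]], "text") # the first document's original text
```

When `id` is omitted, each response is numbered by its position in the input vector.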
@@ -41,24 +56,47 @@ as.data.frame.sentiment_analysis <- function(x) {
   if (!has_name(x, "documentSentiment", "sentences")) {
     return(data.frame())
   }
-  document <- x$documentSentiment[["score"]]
+  doc_score <- x$documentSentiment[["score"]]
+  doc_magnitude <- x$documentSentiment[["magnitude"]]
   content <- get_var(x$sentences, "text", "content")
   offset <- get_var(x$sentences, "text", "beginOffset")
   score <- get_var(x$sentences, "sentiment", "score")
-  if (!all.equal(length(content), length(offset), length(score))) {
+  magnitude <- get_var(x$sentences, "sentiment", "magnitude")
+  id <- attr(x, "id")
+  text <- attr(x, "text")
+  lns <- c(
+    length(content),
+    length(offset),
+    length(score),
+    length(magnitude)
+  )
+  if (length(unique(lns)) != 1L) {
     content <- content[1]
     offset <- offset[1]
     score <- score[1]
+    magnitude <- magnitude[1]
   }
-  data.frame(
-    id = make_ids(1),
-    document = document,
-    sentence = seq_along(content),
-    offset = offset,
+  docs <- data.frame(
+    id = id,
+    unit = "document",
+    score = doc_score,
+    magnitude = doc_magnitude,
+    position = NA_integer_,
+    offset = NA_integer_,
+    content = text,
+    stringsAsFactors = FALSE
+  )
+  sents <- data.frame(
+    id = id,
+    unit = "sentence",
+    score = score,
+    magnitude = magnitude,
+    position = seq_along(content),
+    offset = offset,
     content = content,
     stringsAsFactors = FALSE
   )
+  rbind(docs, sents)
 }
 
 #' @export
@@ -84,14 +122,6 @@ as_tibble.sentiment_analysis <- function(data) {
   tibble::as_tibble(data, validate = FALSE)
 }
 
-parse_docs <- function(x, simplify = FALSE) {
-  if (simplify) {
-    jsonlite::fromJSON(
-      httr::content(x, as = "text", encoding = "UTF-8"))
-  } else {
-    httr::content(x)
-  }
-}
 
 jsonify_text <- function(text) {
   lst <- list(
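With the `text` and `id` attributes attached above, the new `as.data.frame` method returns one long table: a "document" row followed by one "sentence" row per sentence. A sketch with a mocked response object (the field names here are assumptions mirroring the parsed `analyzeSentiment` JSON; real objects come from `parse_docs()`):

```r
## mocked parsed response for illustration only
mock <- structure(
  list(
    documentSentiment = list(score = 0.4, magnitude = 0.8),
    sentences = list(
      list(text = list(content = "I love this.", beginOffset = 0L),
           sentiment = list(score = 0.9, magnitude = 0.9)),
      list(text = list(content = "Rain is coming.", beginOffset = 13L),
           sentiment = list(score = -0.1, magnitude = 0.1))
    )
  ),
  class = c("sentiment_analysis", "list"),
  text = "I love this. Rain is coming.",
  id = "a1B2c3D4"
)
as.data.frame(mock) # 3 rows: 1 document row + 2 sentence rows
```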
49 changes: 35 additions & 14 deletions R/utils.R
@@ -1,5 +1,4 @@
-any_recursive <- function(x) any(vapply(x, is.recursive, logical(1)))
-
+## get variable(s) from nested obj
 get_var <- function(x, ...) {
   vars <- c(...)
   success <- FALSE
@@ -25,7 +24,11 @@ get_var <- function(x, ...) {
   unlist(x)
 }
 
+any_recursive <- function(x) {
+  any(vapply(x, is.recursive, logical(1)))
+}
+
+## has name(s): accepts one or more names (TRUE only if all are found)
 has_name <- function(x, ...) {
   vars <- c(...)
   stopifnot(is.character(vars))
@@ -35,18 +38,24 @@ has_name <- function(x, ...) {
   all(vars %in% names(x))
 }
 
-
+## question and answer (choices) for interactive sessions
 menuline <- function(q, a) {
   message(q)
   menu(a)
 }
 
+## accept a line-broken chr vector and remove user-provided quotes
+## for interactive sessions
 readline_ <- function(...) {
   input <- readline(paste(c(...), collapse = ""))
-  gsub("^\"|\"$", "", input)
+  gsub("^\"|^\'|\"$|\'$", "", input)
 }
 
+## make sure the last line of the R environment file has been filled
 check_renv <- function(path) {
+  if (!file.exists(path)) {
+    return(invisible())
+  }
   con <- file(path)
   x <- readLines(con, warn = FALSE)
   close(con)
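The widened pattern in `readline_()` now strips single quotes as well as double quotes pasted around interactive input, e.g.:

```r
gsub("^\"|^\'|\"$|\'$", "", "'my-api-key'")
#> [1] "my-api-key"
```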
@@ -55,15 +64,27 @@ check_renv <- function(path) {
   invisible()
 }
 
-make_ids <- function(n) {
-  f <- function() {
-    ids <- sample(c(letters, toupper(letters), 0:9, 0:9), 8, replace = TRUE)
-    paste(ids, collapse = "")
-  }
-  ids <- unique(unlist(replicate(n, f(), simplify = FALSE)))
-  if (length(ids) < n) {
-    ids <- unique(unlist(replicate(f, n + 5L, simplify = FALSE)))
-    ids <- sample(ids, n)
-  }
-  ids
-}
+## parse method defaults to parsing individual docs
+parse_docs <- function(x, simplify = FALSE) {
+  if (simplify) {
+    jsonlite::fromJSON(
+      httr::content(x, as = "text", encoding = "UTF-8"))
+  } else {
+    httr::content(x)
+  }
+}
+
+## prep text
+#' @export
+#' @noRd
+prep_text <- function(x) {
+  x <- gsub(
+    "@\\S{1,}|http\\S{1,}|\\n", "", x
+  )
+  x <- gsub(
+    "^\\.\\S{0,}", "", x
+  )
+  trimws(gsub(
+    "[ ]{2,}", " ", x
+  ))
+}
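A quick illustration of the new `prep_text()` helper on a hypothetical tweet: it drops @mentions, URLs, and newlines, then collapses repeated spaces and trims the result.

```r
prep_text("@user check this out https://t.co/abc123\nso     cool")
#> [1] "check this out so cool"
```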
12 changes: 12 additions & 0 deletions R/zzz.R
@@ -0,0 +1,12 @@
+make_ids <- function(n) {
+  f <- function() {
+    ids <- sample(c(letters, toupper(letters), 0:9, 0:9), 8, replace = TRUE)
+    paste(ids, collapse = "")
+  }
+  ids <- unique(unlist(replicate(n, f(), simplify = FALSE)))
+  if (length(ids) < n) {
+    ids <- unique(unlist(replicate(n + 5L, f(), simplify = FALSE)))
+    ids <- sample(ids, n)
+  }
+  ids
+}
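The relocated `make_ids()` draws eight-character alphanumeric IDs (digits sampled at double weight) and retries with a larger batch if a draw collides. A sketch of the invariants (output is random, so only the shape is checked):

```r
ids <- make_ids(100)
length(unique(ids)) # 100 -- all IDs are unique
unique(nchar(ids))  # 8   -- eight characters each
```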
38 changes: 16 additions & 22 deletions README.Rmd
@@ -150,46 +150,40 @@ sa <- tibble::as_tibble(sa)
 sa
 ```
 
-Each row in the converted data frame represents one sentence of the provided text. For each observation, there are six features (variables):
+Each row in the converted data frame represents one unit of the provided text, either a whole document or a single sentence. For each observation, there are seven features (variables):
 
-- `id` Randomly generated ID (assigned to each tweet)
-- `document` Overall sentiment score of the document---in this case, a document is one whole tweet
-- `sentence` The ordinal position of a given sentence as a sequence within a single tweet (e.g., first sentence of a document, second sentence of a document)
-- `offset` The position, in number of characters, from which the sentence started within a document
-- `score` The sentiment (along positive and negative dimensions) score of the sentence
-- `content` The text of the analyzed sentence
+- `id` ID assigned to each document (in this case, each tweet)
+- `unit` Unit of analysis, either "document" or "sentence"
+- `score` The sentiment score (along positive and negative dimensions) of the unit, standardized on a -1.0 to 1.0 scale
+- `magnitude` The magnitude of the score (positive, unstandardized)
+- `position` The ordinal position of a given sentence within a single tweet (e.g., first sentence of a document, second sentence of a document); `NA` for document rows
+- `offset` The relative position, in number of characters, at which the sentence starts within a document
+- `content` The text that was analyzed (corresponds with `unit`)
 
 Explore the data using the tidyverse.
 
 ```{r, eval=TRUE}
-## i subscribe to the tidyverse
-library(tidyverse)
+suppressPackageStartupMessages(library(tidyverse))
 ```
 
-Histogram of sentence scores
+Histogram of scores faceted by unit
 
 ```{r, eval=TRUE}
 sa %>%
-  ggplot(aes(score)) +
-  geom_histogram(binwidth = .1)
-```
-
-Histogram of document scores
-
-```{r, eval=TRUE}
-sa %>%
-  filter(sentence == 1L) %>%
-  ggplot(aes(document)) +
-  geom_histogram(binwidth = .1)
+  ggplot(aes(score, fill = unit)) +
+  geom_histogram(binwidth = .1) +
+  facet_wrap(~ unit)
 ```
 
 Box plot for each sentence position number
 
 ```{r}
 p <- sa %>%
-  mutate(sentence = factor(sentence)) %>%
+  filter(unit == "sentence") %>%
+  mutate(position = factor(position)) %>%
   ggplot(
-    aes(x = sentence, y = score, colour = sentence, fill = sentence)
+    aes(x = position, y = score, colour = position, fill = position)
   ) +
   geom_boxplot(outlier.shape = NA, alpha = .7) +
   geom_jitter(alpha = .4, shape = 21) +
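Because document and sentence rows now share one long table, unit-level summaries reduce to a single grouped pipeline. A sketch assuming the `sa` tibble from the README chunks above:

```r
## mean score and magnitude by unit of analysis
sa %>%
  group_by(unit) %>%
  summarise(
    mean_score = mean(score),
    mean_magnitude = mean(magnitude),
    n = n()
  )
```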
