tidied data parsing and updated readme
Kearney committed Aug 30, 2017
1 parent 364b954 commit 7866832
Showing 10 changed files with 155 additions and 116 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
@@ -8,3 +8,4 @@ export(as.tibble.sentiment_analysis_list)
 export(as_tibble.sentiment_analysis)
 export(as_tibble.sentiment_analysis_list)
 export(googleapis_token)
+export(prep_text)
80 changes: 55 additions & 25 deletions R/analyze_sentiment.R
@@ -1,30 +1,45 @@

 #' analyze_sentiment
 #'
-#' Returns sentiment analysis from the Google Cloud Language API.
+#' Conducts and returns results of sentiment analysis using Google Cloud Natural
+#' Language API.
 #'
-#' @param data Vector of plain text to analyze.
+#' @param text Vector of plain text to analyze.
 #' @return List of parsed response objects.
 #' @export
 #' @aliases analyse_sentiment
-analyze_sentiment <- function(data) {
-  eval(call("analyze_sentiment_", data))
+analyze_sentiment <- function(text, id = NULL) {
+  eval(call("analyze_sentiment_", text, id))
 }
 
-analyze_sentiment_ <- function(text) {
-  analyze_sentiment_internal <- function(text) {
+analyze_sentiment_ <- function(text, id = NULL) {
+  analyze_sentiment_internal <- function(text, id = NULL) {
     ## API path
     path <- "analyzeSentiment"
     ## format text for request
-    text <- jsonify_text(text)
+    jstext <- jsonify_text(text)
     ## execute request
-    r <- httr::POST(api_call(path), body = text)
+    r <- httr::POST(api_call(path), body = jstext)
     ## parse
     r <- parse_docs(r)
-    class(r) <- c("sentiment_analysis", "list")
-    r
+    structure(
+      .Data = r,
+      class = c("sentiment_analysis", "list"),
+      text = text,
+      id = id
+    )
+    ##class(r) <- c("sentiment_analysis", "list")
+    ##r
   }
-  out <- lapply(text, analyze_sentiment_internal)
+  if (!is.null(id)) {
+    stopifnot(length(text) == length(id))
+    out <- Map(analyze_sentiment_internal, text, id)
+  } else {
+    out <- Map(analyze_sentiment_internal, text)
+    for (i in seq_along(out)) {
+      attr(out[[i]], "id") <- i
+    }
+  }
   class(out) <- c("sentiment_analysis_list", "list")
   out
 }
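Taken together, the reworked functions send one request per document and tag each response with its original text and an `id`. A minimal usage sketch (hypothetical input, not part of the commit; assumes a valid token has already been configured via `googleapis_token()`):

```r
## hypothetical documents to analyze
tweets <- c(
  "I love this package. It works great!",
  "This is terrible. Never again."
)
sa <- analyze_sentiment(tweets, id = c("tw1", "tw2"))
class(sa)             # "sentiment_analysis_list" "list"
attr(sa[[1]], "id")   # "tw1"
attr(sa[[1]], "text") # the first document's original text
```

When `id` is omitted, each response is numbered by its position in the input vector.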
@@ -41,24 +56,47 @@ as.data.frame.sentiment_analysis <- function(x) {
   if (!has_name(x, "documentSentiment", "sentences")) {
     return(data.frame())
   }
-  document <- x$documentSentiment[["score"]]
+  doc_score <- x$documentSentiment[["score"]]
+  doc_magnitude <- x$documentSentiment[["magnitude"]]
   content <- get_var(x$sentences, "text", "content")
   offset <- get_var(x$sentences, "text", "beginOffset")
   score <- get_var(x$sentences, "sentiment", "score")
-  if (!all.equal(length(content), length(offset), length(score))) {
+  magnitude <- get_var(x$sentences, "sentiment", "magnitude")
+  id <- attr(x, "id")
+  text <- attr(x, "text")
+  lns <- c(
+    length(content),
+    length(offset),
+    length(score),
+    length(magnitude)
+  )
+  if (length(unique(lns)) != 1L) {
     content <- content[1]
     offset <- offset[1]
     score <- score[1]
+    magnitude <- magnitude[1]
   }
-  data.frame(
-    id = make_ids(1),
-    document = document,
-    sentence = seq_along(content),
-    offset = offset,
+  docs <- data.frame(
+    id = id,
+    unit = "document",
+    score = doc_score,
+    magnitude = doc_magnitude,
+    position = NA_integer_,
+    offset = NA_integer_,
+    content = text,
+    stringsAsFactors = FALSE
+  )
+  sents <- data.frame(
+    id = id,
+    unit = "sentence",
+    score = score,
+    magnitude = magnitude,
+    position = seq_along(content),
+    offset = offset,
     content = content,
     stringsAsFactors = FALSE
   )
+  rbind(docs, sents)
 }
 
 #' @export
@@ -84,14 +122,6 @@ as_tibble.sentiment_analysis <- function(data) {
   tibble::as_tibble(data, validate = FALSE)
 }
 
-parse_docs <- function(x, simplify = FALSE) {
-  if (simplify) {
-    jsonlite::fromJSON(
-      httr::content(x, as = "text", encoding = "UTF-8"))
-  } else {
-    httr::content(x)
-  }
-}
 
 jsonify_text <- function(text) {
   lst <- list(
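With the `text` and `id` attributes attached above, the new `as.data.frame` method returns one long table: a "document" row followed by one "sentence" row per sentence. A sketch with a mocked response object (the field names here are assumptions mirroring the parsed `analyzeSentiment` JSON; real objects come from `parse_docs()`):

```r
## mocked parsed response for illustration only
mock <- structure(
  list(
    documentSentiment = list(score = 0.4, magnitude = 0.8),
    sentences = list(
      list(text = list(content = "I love this.", beginOffset = 0L),
           sentiment = list(score = 0.9, magnitude = 0.9)),
      list(text = list(content = "Rain is coming.", beginOffset = 13L),
           sentiment = list(score = -0.1, magnitude = 0.1))
    )
  ),
  class = c("sentiment_analysis", "list"),
  text = "I love this. Rain is coming.",
  id = "a1B2c3D4"
)
as.data.frame(mock) # 3 rows: 1 document row + 2 sentence rows
```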
49 changes: 35 additions & 14 deletions R/utils.R
@@ -1,5 +1,4 @@
-any_recursive <- function(x) any(vapply(x, is.recursive, logical(1)))
-
+## get variable(s) from nested obj
 get_var <- function(x, ...) {
   vars <- c(...)
   success <- FALSE
@@ -25,7 +24,11 @@ get_var <- function(x, ...) {
   unlist(x)
 }
 
+any_recursive <- function(x) {
+  any(vapply(x, is.recursive, logical(1)))
+}
+
+## has name(s): accepts one or more names (TRUE only if all are found)
 has_name <- function(x, ...) {
   vars <- c(...)
   stopifnot(is.character(vars))
@@ -35,18 +38,24 @@ has_name <- function(x, ...) {
   all(vars %in% names(x))
 }
 
-
+## question and answer (choices) for interactive sessions
 menuline <- function(q, a) {
   message(q)
   menu(a)
 }
 
+## accept a line-broken chr vector and remove user-provided quotes
+## for interactive sessions
 readline_ <- function(...) {
   input <- readline(paste(c(...), collapse = ""))
-  gsub("^\"|\"$", "", input)
+  gsub("^\"|^\'|\"$|\'$", "", input)
 }
 
+## make sure the last line of the R environment file has been filled
 check_renv <- function(path) {
+  if (!file.exists(path)) {
+    return(invisible())
+  }
   con <- file(path)
   x <- readLines(con, warn = FALSE)
   close(con)
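The widened pattern in `readline_()` now strips single quotes as well as double quotes pasted around interactive input, e.g.:

```r
gsub("^\"|^\'|\"$|\'$", "", "'my-api-key'")
#> [1] "my-api-key"
```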
@@ -55,15 +64,27 @@ check_renv <- function(path) {
   invisible()
 }
 
-make_ids <- function(n) {
-  f <- function() {
-    ids <- sample(c(letters, toupper(letters), 0:9, 0:9), 8, replace = TRUE)
-    paste(ids, collapse = "")
-  }
-  ids <- unique(unlist(replicate(n, f(), simplify = FALSE)))
-  if (length(ids) < n) {
-    ids <- unique(unlist(replicate(f, n + 5L, simplify = FALSE)))
-    ids <- sample(ids, n)
-  }
-  ids
-}
+## parse method defaults to parsing individual docs
+parse_docs <- function(x, simplify = FALSE) {
+  if (simplify) {
+    jsonlite::fromJSON(
+      httr::content(x, as = "text", encoding = "UTF-8"))
+  } else {
+    httr::content(x)
+  }
+}
+
+## prep text
+#' @export
+#' @noRd
+prep_text <- function(x) {
+  x <- gsub(
+    "@\\S{1,}|http\\S{1,}|\\n", "", x
+  )
+  x <- gsub(
+    "^\\.\\S{0,}", "", x
+  )
+  trimws(gsub(
+    "[ ]{2,}", " ", x
+  ))
+}
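A quick illustration of the new `prep_text()` helper on a hypothetical tweet: it drops @mentions, URLs, and newlines, then collapses repeated spaces and trims the result.

```r
prep_text("@user check this out https://t.co/abc123\nso     cool")
#> [1] "check this out so cool"
```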
12 changes: 12 additions & 0 deletions R/zzz.R
@@ -0,0 +1,12 @@
+make_ids <- function(n) {
+  f <- function() {
+    ids <- sample(c(letters, toupper(letters), 0:9, 0:9), 8, replace = TRUE)
+    paste(ids, collapse = "")
+  }
+  ids <- unique(unlist(replicate(n, f(), simplify = FALSE)))
+  if (length(ids) < n) {
+    ids <- unique(unlist(replicate(n + 5L, f(), simplify = FALSE)))
+    ids <- sample(ids, n)
+  }
+  ids
+}
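The relocated `make_ids()` draws eight-character alphanumeric IDs (digits sampled at double weight) and retries with a larger batch if a draw collides. A sketch of the invariants (output is random, so only the shape is checked):

```r
ids <- make_ids(100)
length(unique(ids)) # 100 -- all IDs are unique
unique(nchar(ids))  # 8   -- eight characters each
```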
38 changes: 16 additions & 22 deletions README.Rmd
@@ -150,46 +150,40 @@ sa <- tibble::as_tibble(sa)
 sa
 ```
 
-Each row in the converted data frame represents one sentence of the provided text. For each observation, there are six features (variables):
+Each row in the converted data frame represents one unit of the provided text, either a whole document or a single sentence. For each observation, there are seven features (variables):
 
-- `id` Randomly generated ID (assigned to each tweet)
-- `document` Overall sentiment score of the document---in this case, a document is one whole tweet
-- `sentence` The ordinal position of a given sentence as a sequence within a single tweet (e.g., first sentence of a document, second sentence of a document)
-- `offset` The position, in number of characters, from which the sentence started within a document
-- `score` The sentiment (along positive and negative dimensions) score of the sentence
-- `content` The text of the analyzed sentence
+- `id` ID assigned to each document (in this case, each tweet)
+- `unit` Unit of analysis, either "document" or "sentence"
+- `score` The sentiment score (along positive and negative dimensions) of the unit, standardized on a -1.0 to 1.0 scale
+- `magnitude` The magnitude of the score (positive, unstandardized)
+- `position` The ordinal position of a given sentence within a single tweet (e.g., first sentence of a document, second sentence of a document); `NA` for document rows
+- `offset` The relative position, in number of characters, at which the sentence starts within a document
+- `content` The text that was analyzed (corresponds with `unit`)
 
 Explore the data using the tidyverse.
 
 ```{r, eval=TRUE}
-## i subscribe to the tidyverse
-library(tidyverse)
+suppressPackageStartupMessages(library(tidyverse))
 ```
 
-Histogram of sentence scores
+Histogram of scores faceted by unit
 
 ```{r, eval=TRUE}
 sa %>%
-  ggplot(aes(score)) +
-  geom_histogram(binwidth = .1)
-```
-
-Histogram of document scores
-
-```{r, eval=TRUE}
-sa %>%
-  filter(sentence == 1L) %>%
-  ggplot(aes(document)) +
-  geom_histogram(binwidth = .1)
+  ggplot(aes(score, fill = unit)) +
+  geom_histogram(binwidth = .1) +
+  facet_wrap(~ unit)
 ```
 
 Box plot for each sentence position number
 
 ```{r}
 p <- sa %>%
-  mutate(sentence = factor(sentence)) %>%
+  filter(unit == "sentence") %>%
+  mutate(position = factor(position)) %>%
   ggplot(
-    aes(x = sentence, y = score, colour = sentence, fill = sentence)
+    aes(x = position, y = score, colour = position, fill = position)
   ) +
   geom_boxplot(outlier.shape = NA, alpha = .7) +
   geom_jitter(alpha = .4, shape = 21) +
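Because document and sentence rows now share one long table, unit-level summaries reduce to a single grouped pipeline. A sketch assuming the `sa` tibble from the README chunks above:

```r
## mean score and magnitude by unit of analysis
sa %>%
  group_by(unit) %>%
  summarise(
    mean_score = mean(score),
    mean_magnitude = mean(magnitude),
    n = n()
  )
```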
