Skip to content

Commit

Permalink
use tokens instead of corpus objects for get_ wrapper functions
Browse files Browse the repository at this point in the history
This allows user to pre-process text as desired using quanteda before running get_ functions.
  • Loading branch information
prodriguezsosa committed Oct 19, 2021
1 parent 8dfaeed commit e3c089e
Show file tree
Hide file tree
Showing 11 changed files with 171 additions and 161 deletions.
Binary file modified .DS_Store
Binary file not shown.
2 changes: 1 addition & 1 deletion R/classes.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ setClass("dem",
docvars = "data.frame",
features = "character"),
prototype = list(Dim = integer(2),
Dimnames = list(rows = character(), columns = NULL),
Dimnames = list(docs = character(), columns = NULL),
docvars = data.frame(row.names = integer()),
features = character()),
contains = "dgCMatrix")
Expand Down
61 changes: 31 additions & 30 deletions R/get_cos_sim.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#' Given a corpus and a set of features, calculate cosine similarities over
#' a grouping variable.
#'
#' @param x a (quanteda) corpus or character vector
#' @param x a (quanteda) tokens object
#' @param groups a grouping variable
#' @param features (character) features of interest
#' @inheritParams dem
Expand All @@ -11,8 +11,6 @@
#' if groups defined, sampling is automatically stratified; top nns are those with
#' the highest average over all bootstrap samples.
#' @param num_bootstraps (integer) number of bootstraps to use
#' @param what character; which quanteda tokenizer to use. You will rarely want to change this.
#' For Chinese texts you may want to set what = 'fastestword'.
#'
#' @return a `data.frame` or list of data.frames (one for each target)
#' with the following columns:
Expand All @@ -36,8 +34,10 @@
#' immig_corpus <- corpus_context(x = cr_sample_corpus,
#' pattern = "immigration", window = 6L, verbose = TRUE)
#'
#' get_cos_sim(x = immig_corpus,
#' groups = docvars(immig_corpus, 'party'),
#' immig_toks <- tokens(immig_corpus)
#'
#' get_cos_sim(x = immig_toks,
#' groups = docvars(immig_toks, "party"),
#' features = c("reform", "enforce"),
#' pre_trained = glove_subset,
#' transform = TRUE,
Expand All @@ -54,22 +54,20 @@ get_cos_sim <- function(x,
transform_matrix,
bootstrap = TRUE,
num_bootstraps = 10,
what = 'word',
as_list = TRUE,
verbose = TRUE) {

# create a new corpus
x <- quanteda::corpus(as.character(x), docvars = data.frame('group' = groups))
# add grouping variable to docvars
if(!is.null(groups)) docvars(x) <- NULL; docvars(x, "group") <- groups

if(bootstrap){
cossimdf_bs <- replicate(num_bootstraps,
cos_sim_boostrap(x = x,
groups = quanteda::docvars(x, 'group'),
groups = groups,
features = features,
pre_trained = pre_trained,
transform = transform,
transform_matrix = transform_matrix,
what = what,
as_list = FALSE),
simplify = FALSE)
result <- do.call(rbind, cossimdf_bs) %>%
Expand All @@ -81,23 +79,21 @@ get_cos_sim <- function(x,
dplyr::select('target', 'feature', 'value', 'std.error')
}else{

# create a new corpus
x <- quanteda::corpus(as.character(x), docvars = data.frame('group' = groups))

# tokenize texts
corpus_toks <- quanteda::tokens(x, what = what)

# create document-feature matrix
corpus_dfm <- quanteda::dfm(corpus_toks, tolower = FALSE)
x_dfm <- quanteda::dfm(x, tolower = FALSE)

# compute document-embedding matrix
corpus_dem <- dem(x = corpus_dfm, pre_trained = pre_trained, transform_matrix = transform_matrix, transform = transform, verbose = verbose)
x_dem <- dem(x = x_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE)

# aggregate dems by group var
if(!is.null(groups)) corpus_dem <- dem_group(x = corpus_dem, groups = corpus_dem@docvars$group)
if(!is.null(groups)){
wvs <- dem_group(x = x_dem, groups = x_dem@docvars$group)
} else {
wvs <- matrix(colMeans(x_dem), ncol = ncol(x_dem))
}

# compute cosine similarity
result <- cos_sim(x = corpus_dem, pre_trained = pre_trained, features = features)
result <- cos_sim(x = wvs, pre_trained = pre_trained, features = features)
}

# if !as_list return a list object with an item for each feature data.frame
Expand All @@ -113,25 +109,30 @@ cos_sim_boostrap <- function(x,
pre_trained,
transform = TRUE,
transform_matrix,
what = what,
as_list = FALSE){
# create a new corpus
x <- quanteda::corpus_sample(x, size = quanteda::ndoc(x), replace = TRUE, by = groups)

# tokenize texts
corpus_toks <- quanteda::tokens(x, what = what)
# sample tokens with replacement
if(!is.null(groups)) {
x <- quanteda::tokens_sample(x = x, size = table(groups), replace = TRUE, by = groups)
} else {
x <- quanteda::tokens_sample(x = x, size = quanteda::ndoc(x), replace = TRUE)
}

# create document-feature matrix
corpus_dfm <- quanteda::dfm(corpus_toks, tolower = FALSE)
x_dfm <- quanteda::dfm(x, tolower = FALSE)

# compute document-embedding matrix
corpus_dem <- dem(x = corpus_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE)
x_dem <- dem(x = x_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE)

# aggregate dems by group var
if(!is.null(groups)) corpus_dem <- dem_group(x = corpus_dem, groups = corpus_dem@docvars$group)
# aggregate dems by group var if defined
if(!is.null(groups)){
wvs <- dem_group(x = x_dem, groups = x_dem@docvars$group)
} else {
wvs <- matrix(colMeans(x_dem), ncol = ncol(x_dem))
}

# compute cosine similarity
result <- cos_sim(x = corpus_dem, pre_trained = pre_trained, features = features, as_list = FALSE)
result <- cos_sim(x = wvs, pre_trained = pre_trained, features = features, as_list = FALSE)

return(result)

Expand Down
60 changes: 32 additions & 28 deletions R/get_nns.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#' Given a corpus and a set of candidate neighbors, find the top N nearest
#' neighbors.
#'
#' @param x a (quanteda) corpus or character vector
#' @param x a (quanteda) tokens object
#' @inheritParams nns
#' @param groups grouping variable equal in length to the number of documents
#' @inheritParams dem
Expand All @@ -10,8 +10,6 @@
#' if groups defined, sampling is automatically stratified; top nns are those with
#' the highest average over all bootstrap samples.
#' @param num_bootstraps (integer) number of bootstraps to use
#' @param what character; which quanteda tokenizer to use. You will rarely want to change this.
#' For Chinese texts you may want to set what = 'fastestword'.
#'
#' @return a `data.frame` or list of data.frames (one for each target)
#' with the following columns:
Expand All @@ -34,8 +32,10 @@
#' immig_corpus <- corpus_context(x = cr_sample_corpus,
#' pattern = "immigration", window = 6L, verbose = TRUE)
#'
#' get_nns(x = immig_corpus, N = 10,
#' groups = docvars(immig_corpus, 'party'),
#' immig_toks <- tokens(immig_corpus)
#'
#' get_nns(x = immig_toks, N = 10,
#' groups = docvars(immig_toks, "party"),
#' candidates = character(0),
#' pre_trained = glove_subset,
#' transform = TRUE,
Expand All @@ -52,21 +52,20 @@ get_nns <- function(x,
transform_matrix,
bootstrap = TRUE,
num_bootstraps = 10,
what = 'word',
as_list = TRUE) {

# create a new corpus
x <- quanteda::corpus(as.character(x), docvars = data.frame('group' = groups))
# add grouping variable to docvars
if(!is.null(groups)) docvars(x) <- NULL; docvars(x, "group") <- groups

# if bootstrap
if(bootstrap){
nnsdf_bs <- replicate(num_bootstraps,
nns_boostrap(x = x,
groups = quanteda::docvars(x, 'group'),
groups = groups,
candidates = candidates,
pre_trained = pre_trained,
transform = transform,
transform_matrix = transform_matrix,
what = what,
as_list = FALSE),
simplify = FALSE)
result <- do.call(rbind, nnsdf_bs) %>%
Expand All @@ -81,20 +80,21 @@ get_nns <- function(x,
dplyr::select('target', 'feature', 'rank', 'value', 'std.error')
}else{

# tokenize texts
corpus_toks <- quanteda::tokens(x, what = what)

# create document-feature matrix
corpus_dfm <- quanteda::dfm(corpus_toks, tolower = FALSE)
x_dfm <- quanteda::dfm(x, tolower = FALSE)

# compute document-embedding matrix
corpus_dem <- dem(x = corpus_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE)
x_dem <- dem(x = x_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE)

# aggregate dems by group var
if(!is.null(groups)) corpus_dem <- dem_group(x = corpus_dem, groups = corpus_dem@docvars$group)
if(!is.null(groups)){
wvs <- dem_group(x = x_dem, groups = x_dem@docvars$group)
} else {
wvs <- matrix(colMeans(x_dem), ncol = ncol(x_dem))
}

# find nearest neighbors
result <- nns(x = corpus_dem, N = N, candidates = candidates, pre_trained = pre_trained, as_list = FALSE)
result <- nns(x = wvs, N = N, candidates = candidates, pre_trained = pre_trained, as_list = FALSE)
}

# if !as_list return a list object with an item for each target data.frame
Expand All @@ -111,26 +111,30 @@ nns_boostrap <- function(x,
pre_trained,
transform = TRUE,
transform_matrix,
what = what,
as_list = FALSE){

# create a new corpus
x <- quanteda::corpus_sample(x, size = quanteda::ndoc(x), replace = TRUE, by = groups)

# tokenize texts
corpus_toks <- quanteda::tokens(x, what = what)
# sample tokens with replacement
if(!is.null(groups)) {
x <- quanteda::tokens_sample(x = x, size = table(groups), replace = TRUE, by = groups)
} else {
x <- quanteda::tokens_sample(x = x, size = quanteda::ndoc(x), replace = TRUE)
}

# create document-feature matrix
corpus_dfm <- quanteda::dfm(corpus_toks, tolower = FALSE)
x_dfm <- quanteda::dfm(x, tolower = FALSE)

# compute document-embedding matrix
corpus_dem <- dem(x = corpus_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE)
x_dem <- dem(x = x_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE)

# aggregate dems by group var
if(!is.null(groups)) corpus_dem <- dem_group(x = corpus_dem, groups = corpus_dem@docvars$group)
# aggregate dems by group var if defined
if(!is.null(groups)){
wvs <- dem_group(x = x_dem, groups = x_dem@docvars$group)
} else {
wvs <- matrix(colMeans(x_dem), ncol = ncol(x_dem))
}

# find nearest neighbors
result <- nns(x = corpus_dem, N = Inf, candidates = candidates, pre_trained = pre_trained, as_list = FALSE)
result <- nns(x = wvs, N = Inf, candidates = candidates, pre_trained = pre_trained, as_list = FALSE)

return(result)

Expand Down
Loading

0 comments on commit e3c089e

Please sign in to comment.