diff --git a/.DS_Store b/.DS_Store index ebc5503..0632c31 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/R/classes.R b/R/classes.R index a9520d2..bdb4bdf 100644 --- a/R/classes.R +++ b/R/classes.R @@ -13,7 +13,7 @@ setClass("dem", docvars = "data.frame", features = "character"), prototype = list(Dim = integer(2), - Dimnames = list(rows = character(), columns = NULL), + Dimnames = list(docs = character(), columns = NULL), docvars = data.frame(row.names = integer()), features = character()), contains = "dgCMatrix") diff --git a/R/get_cos_sim.R b/R/get_cos_sim.R index 5dfef73..8a1d75e 100644 --- a/R/get_cos_sim.R +++ b/R/get_cos_sim.R @@ -1,7 +1,7 @@ #' Given a corpus and a set of features, calculate cosine similarities over #' a grouping variable. #' -#' @param x a (quanteda) corpus or character vector +#' @param x a (quanteda) tokens object #' @param groups a grouping variable #' @param features (character) features of interest #' @inheritParams dem @@ -11,8 +11,6 @@ #' if groups defined, sampling is automatically stratified; top nns are those with #' the highest average over all bootstrap samples. #' @param num_bootstraps (integer) number of bootstraps to use -#' @param what character; which quanteda tokenizer to use. You will rarely want to change this. -#' For Chinese texts you may want to set what = 'fastestword'. #' #' @return a `data.frame` or list of data.frames (one for each target) #' with the following columns: @@ -36,8 +34,10 @@ #' immig_corpus <- corpus_context(x = cr_sample_corpus, #' pattern = "immigration", window = 6L, verbose = TRUE) #' -#' get_cos_sim(x = immig_corpus, -#' groups = docvars(immig_corpus, 'party'), +#' immig_toks <- tokens(immig_corpus) +#' +#' get_cos_sim(x = immig_toks, +#' groups = docvars(immig_toks, "party"), #' features = c("reform", "enforce"), #' pre_trained = glove_subset, #' transform = TRUE, @@ -54,22 +54,20 @@ get_cos_sim <- function(x, transform_matrix, bootstrap = TRUE, num_bootstraps = 10, - what = 'word', as_list = TRUE, verbose = TRUE) { - # create a new corpus - x <- quanteda::corpus(as.character(x), docvars = data.frame('group' = groups)) + # add grouping variable to docvars + if(!is.null(groups)) docvars(x) <- NULL; docvars(x, "group") <- groups if(bootstrap){ cossimdf_bs <- replicate(num_bootstraps, cos_sim_boostrap(x = x, - groups = quanteda::docvars(x, 'group'), + groups = groups, features = features, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, - what = what, as_list = FALSE), simplify = FALSE) result <- do.call(rbind, cossimdf_bs) %>% @@ -81,23 +79,21 @@ get_cos_sim <- function(x, dplyr::select('target', 'feature', 'value', 'std.error') }else{ - # create a new corpus - x <- quanteda::corpus(as.character(x), docvars = data.frame('group' = groups)) - - # tokenize texts - corpus_toks <- quanteda::tokens(x, what = what) - # create document-feature matrix - corpus_dfm <- quanteda::dfm(corpus_toks, tolower = FALSE) + x_dfm <- quanteda::dfm(x, tolower = FALSE) # compute document-embedding matrix - corpus_dem <- dem(x = corpus_dfm, pre_trained = pre_trained, transform_matrix = transform_matrix, transform = transform, verbose = verbose) + x_dem <- dem(x = x_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE) # aggregate dems by group var - if(!is.null(groups)) corpus_dem <- dem_group(x = corpus_dem, groups = corpus_dem@docvars$group) + if(!is.null(groups)){ + wvs <- dem_group(x = x_dem, groups = x_dem@docvars$group) + } else { + wvs <- matrix(colMeans(x_dem), ncol = ncol(x_dem)) + } # compute cosine similarity - result <- cos_sim(x = corpus_dem, pre_trained = pre_trained, features = features) + result <- cos_sim(x = wvs, pre_trained = pre_trained, features = features) } # if !as_list return a list object with an item for each feature data.frame @@ -113,25 +109,30 @@ cos_sim_boostrap <- function(x, pre_trained, transform = TRUE, transform_matrix, - what = what, as_list = FALSE){ - # create a new corpus - x <- quanteda::corpus_sample(x, size = quanteda::ndoc(x), replace = TRUE, by = groups) - # tokenize texts - corpus_toks <- quanteda::tokens(x, what = what) + # sample tokens with replacement + if(!is.null(groups)) { + x <- quanteda::tokens_sample(x = x, size = table(groups), replace = TRUE, by = groups) + } else { + x <- quanteda::tokens_sample(x = x, size = quanteda::ndoc(x), replace = TRUE) + } # create document-feature matrix - corpus_dfm <- quanteda::dfm(corpus_toks, tolower = FALSE) + x_dfm <- quanteda::dfm(x, tolower = FALSE) # compute document-embedding matrix - corpus_dem <- dem(x = corpus_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE) + x_dem <- dem(x = x_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE) - # aggregate dems by group var - if(!is.null(groups)) corpus_dem <- dem_group(x = corpus_dem, groups = corpus_dem@docvars$group) + # aggregate dems by group var if defined + if(!is.null(groups)){ + wvs <- dem_group(x = x_dem, groups = x_dem@docvars$group) + } else { + wvs <- matrix(colMeans(x_dem), ncol = ncol(x_dem)) + } # compute cosine similarity - result <- cos_sim(x = corpus_dem, pre_trained = pre_trained, features = features, as_list = FALSE) + result <- cos_sim(x = wvs, pre_trained = pre_trained, features = features, as_list = FALSE) return(result) diff --git a/R/get_nns.R b/R/get_nns.R index 71b68f6..65d2a43 100644 --- a/R/get_nns.R +++ b/R/get_nns.R @@ -1,7 +1,7 @@ #' Given a corpus and a set of candidate neighbors, find the top N nearest #' neighbors. #' -#' @param x a (quanteda) corpus or character vector +#' @param x a (quanteda) tokens object #' @inheritParams nns #' @param groups grouping variable equal in length to the number of documents #' @inheritParams dem @@ -10,8 +10,6 @@ #' if groups defined, sampling is automatically stratified; top nns are those with #' the highest average over all bootstrap samples. #' @param num_bootstraps (integer) number of bootstraps to use -#' @param what character; which quanteda tokenizer to use. You will rarely want to change this. -#' For Chinese texts you may want to set what = 'fastestword'. #' #' @return a `data.frame` or list of data.frames (one for each target) #' with the following columns: @@ -34,8 +32,10 @@ #' immig_corpus <- corpus_context(x = cr_sample_corpus, #' pattern = "immigration", window = 6L, verbose = TRUE) #' -#' get_nns(x = immig_corpus, N = 10, -#' groups = docvars(immig_corpus, 'party'), +#' immig_toks <- tokens(immig_corpus) +#' +#' get_nns(x = immig_toks, N = 10, +#' groups = docvars(immig_toks, "party"), #' candidates = character(0), #' pre_trained = glove_subset, #' transform = TRUE, @@ -52,21 +52,20 @@ get_nns <- function(x, transform_matrix, bootstrap = TRUE, num_bootstraps = 10, - what = 'word', as_list = TRUE) { - # create a new corpus - x <- quanteda::corpus(as.character(x), docvars = data.frame('group' = groups)) + # add grouping variable to docvars + if(!is.null(groups)) docvars(x) <- NULL; docvars(x, "group") <- groups + # if bootstrap if(bootstrap){ nnsdf_bs <- replicate(num_bootstraps, nns_boostrap(x = x, - groups = quanteda::docvars(x, 'group'), + groups = groups, candidates = candidates, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, - what = what, as_list = FALSE), simplify = FALSE) result <- do.call(rbind, nnsdf_bs) %>% @@ -81,20 +80,21 @@ get_nns <- function(x, dplyr::select('target', 'feature', 'rank', 'value', 'std.error') }else{ - # tokenize texts - corpus_toks <- quanteda::tokens(x, what = what) - # create document-feature matrix - corpus_dfm <- quanteda::dfm(corpus_toks, tolower = FALSE) + x_dfm <- quanteda::dfm(x, tolower = FALSE) # compute document-embedding matrix - corpus_dem <- dem(x = corpus_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE) + x_dem <- dem(x = x_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE) # aggregate dems by group var - if(!is.null(groups)) corpus_dem <- dem_group(x = corpus_dem, groups = corpus_dem@docvars$group) + if(!is.null(groups)){ + wvs <- dem_group(x = x_dem, groups = x_dem@docvars$group) + } else { + wvs <- matrix(colMeans(x_dem), ncol = ncol(x_dem)) + } # find nearest neighbors - result <- nns(x = corpus_dem, N = N, candidates = candidates, pre_trained = pre_trained, as_list = FALSE) + result <- nns(x = wvs, N = N, candidates = candidates, pre_trained = pre_trained, as_list = FALSE) } # if !as_list return a list object with an item for each target data.frame @@ -111,26 +111,30 @@ nns_boostrap <- function(x, pre_trained, transform = TRUE, transform_matrix, - what = what, as_list = FALSE){ - # create a new corpus - x <- quanteda::corpus_sample(x, size = quanteda::ndoc(x), replace = TRUE, by = groups) - - # tokenize texts - corpus_toks <- quanteda::tokens(x, what = what) + # sample tokens with replacement + if(!is.null(groups)) { + x <- quanteda::tokens_sample(x = x, size = table(groups), replace = TRUE, by = groups) + } else { + x <- quanteda::tokens_sample(x = x, size = quanteda::ndoc(x), replace = TRUE) + } # create document-feature matrix - corpus_dfm <- quanteda::dfm(corpus_toks, tolower = FALSE) + x_dfm <- quanteda::dfm(x, tolower = FALSE) # compute document-embedding matrix - corpus_dem <- dem(x = corpus_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE) + x_dem <- dem(x = x_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE) - # aggregate dems by group var - if(!is.null(groups)) corpus_dem <- dem_group(x = corpus_dem, groups = corpus_dem@docvars$group) + # aggregate dems by group var if defined + if(!is.null(groups)){ + wvs <- dem_group(x = x_dem, groups = x_dem@docvars$group) + } else { + wvs <- matrix(colMeans(x_dem), ncol = ncol(x_dem)) + } # find nearest neighbors - result <- nns(x = corpus_dem, N = Inf, candidates = candidates, pre_trained = pre_trained, as_list = FALSE) + result <- nns(x = wvs, N = Inf, candidates = candidates, pre_trained = pre_trained, as_list = FALSE) return(result) diff --git a/R/get_nns_ratio.R b/R/get_nns_ratio.R index 864ef8f..be27b51 100644 --- a/R/get_nns_ratio.R +++ b/R/get_nns_ratio.R @@ -1,20 +1,19 @@ #' Given a corpus and a binary grouping variable, computes the ratio of cosine similarities #' over the union of their respective N nearest neighbors. #' -#' @param x a (quanteda) corpus or character vector +#' @param x a (quanteda) tokens object #' @inheritParams nns_ratio #' @inheritParams dem #' @inheritParams dem #' @inheritParams dem_group -#' @param numerator (character) +#' @param numerator (character) defines which group is the nuemerator in the ratio. #' @param bootstrap (logical) if TRUE, bootstrap nns - sample from corpus with replacement; #' if groups defined, sampling is automatically stratified; top nns are those with #' the highest average over all bootstrap samples. #' @param num_bootstraps (integer) number of bootstraps to use #' @param permute (logical) if TRUE, compute empirical p-values using permutation test #' @param num_permutations (numeric) number of permutations to use -#' @param what character; which quanteda tokenizer to use. You will rarely want to change this. -#' For Chinese texts you may want to set what = 'fastestword'. +#' @param verbose provide information on which group is the numerator #' #' @return a `data.frame` with following columns: #' \item{`feature`}{(character) vector of features from the candidate set, @@ -39,12 +38,17 @@ #' window = 6L, #' verbose = TRUE) #' +#' # identify features in local vocab that overlap with pre-trained embeddings +#' # these are used to define candidate nearest neighbors below (optional) #' local_vocab <- get_local_vocab(as.character(immig_corpus), pre_trained = glove_subset) #' +#' # tokenize +#' immig_toks <- tokens(immig_corpus) +#' #' set.seed(42L) -#' get_nns_ratio(x = immig_corpus, +#' temp <- get_nns_ratio(x = immig_toks, #' N = 20, -#' groups = docvars(immig_corpus, 'party'), +#' groups = docvars(immig_toks, 'party'), #' numerator = "R", #' candidates = local_vocab, #' pre_trained = glove_subset, @@ -53,11 +57,11 @@ #' bootstrap = TRUE, #' num_bootstraps = 10, #' permute = TRUE, -#' num_permutations = 10) -#' +#' num_permutations = 10, +#' verbose = TRUE) get_nns_ratio <- function(x, N = 10, - groups = NULL, + groups, numerator = NULL, candidates = character(0), pre_trained, @@ -67,38 +71,32 @@ get_nns_ratio <- function(x, num_bootstraps = 10, permute = TRUE, num_permutations = 100, - what = 'word'){ + verbose = TRUE){ # checks - group_order <- unique(groups) - if(is.null(group_order)) stop("a binary grouping variable must be provided") - if(length(group_order)!=2) stop("groups must be binary") + group_vars <- unique(groups) + if(is.null(group_vars) | length(group_vars)!=2) stop("a binary grouping variable must be provided") if(!is.null(numerator)){ - if(!(numerator %in% group_order)){ - stop("numerator must refer to one of the two groups in the groups argument")}else{ - group_order <- c(numerator, setdiff(group_order, numerator))} + if(!(numerator %in% group_vars)) stop("numerator must refer to one of the two groups in the groups argument") } + denominator <- setdiff(group_vars, numerator) - # create a new corpus - x <- quanteda::corpus(as.character(x), docvars = data.frame('group' = groups)) - - # tokenize texts - corpus_toks <- quanteda::tokens(x, what = what) + # add grouping variable to docvars + if(!is.null(groups)) docvars(x) <- NULL; docvars(x, "group") <- groups # create document-feature matrix - corpus_dfm <- quanteda::dfm(corpus_toks, tolower = FALSE) + x_dfm <- quanteda::dfm(x, tolower = FALSE) # compute document-embedding matrix - corpus_dem <- dem(x = corpus_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE) + x_dem <- dem(x = x_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE) # aggregate dems by group var - corpus_dem <- dem_group(x = corpus_dem, groups = corpus_dem@docvars$group) - corpus_dem <- corpus_dem[group_order,] # re-arrange groups + wvs <- dem_group(x = x_dem, groups = x_dem@docvars$group) # get top N nns (if N is Inf or NULL, use all features) - nnsdfs <- nns(x = corpus_dem, N = Inf, candidates = candidates, pre_trained = pre_trained, as_list = TRUE) - nnsdf1 <- if(is.null(N)) nnsdfs[[1]]$feature else nnsdfs[[1]]$feature[1:N] - nnsdf2 <- if(is.null(N)) nnsdfs[[2]]$feature else nnsdfs[[2]]$feature[1:N] + nnsdfs <- nns(x = wvs, N = Inf, candidates = candidates, pre_trained = pre_trained, as_list = TRUE) + nnsdf1 <- if(is.null(N)) nnsdfs[[numerator]]$feature else nnsdfs[[numerator]]$feature[1:N] + nnsdf2 <- if(is.null(N)) nnsdfs[[denominator]]$feature else nnsdfs[[denominator]]$feature[1:N] # get union of top N nns union_nns <- union(nnsdf1, nnsdf2) @@ -106,7 +104,7 @@ get_nns_ratio <- function(x, if(!bootstrap){ # find nearest neighbors ratio - result <- nns_ratio(x = corpus_dem, N = N, candidates = union_nns, pre_trained = pre_trained) + result <- nns_ratio(x = wvs, N = N, numerator = numerator, candidates = union_nns, pre_trained = pre_trained) }else{ @@ -114,13 +112,12 @@ get_nns_ratio <- function(x, # bootstrap ratio nnsratiodf_bs <- replicate(num_bootstraps, nns_ratio_boostrap(x = x, - groups = quanteda::docvars(x, 'group'), - group_order = group_order, + groups = groups, + numerator = numerator, candidates = union_nns, pre_trained = pre_trained, transform = transform, - transform_matrix = transform_matrix, - what = what), + transform_matrix = transform_matrix), simplify = FALSE) result <- do.call(rbind, nnsratiodf_bs) %>% dplyr::group_by(feature) %>% @@ -141,13 +138,12 @@ get_nns_ratio <- function(x, # permute similarity cat('starting permutations \n') permute_out <- replicate(num_permutations, nns_ratio_permute(x, - groups = quanteda::docvars(x, 'group'), - group_order = group_order, + groups = groups, + numerator = numerator, candidates = union_nns, pre_trained = pre_trained, transform = transform, - transform_matrix = transform_matrix, - what = what), + transform_matrix = transform_matrix), simplify = FALSE) # compute deviations of the observed ratios from 1 @@ -163,45 +159,40 @@ get_nns_ratio <- function(x, # add information on nns result <- result %>% dplyr::mutate(group = dplyr::case_when((feature %in% nnsdf1) & (feature %in% nnsdf2) ~ "shared", - (feature %in% nnsdf1) & !(feature %in% nnsdf2) ~ group_order[1], - !(feature %in% nnsdf1) & (feature %in% nnsdf2) ~ group_order[2])) + (feature %in% nnsdf1) & !(feature %in% nnsdf2) ~ numerator, + !(feature %in% nnsdf1) & (feature %in% nnsdf2) ~ denominator)) # add an attribute specifying which group is the numerator and communicated this to user - attr(result, "numerator") <- group_order[1] - cat("NOTE: values refer to the ratio", paste0(group_order[1], "/", group_order[2], ".")) + attr(result, "numerator") <- numerator + if(verbose) cat("NOTE: values refer to the ratio", paste0(numerator, "/", denominator, ".")) return(result) } - # sub-function nns_ratio_boostrap <- function(x, groups, - group_order, + numerator = NULL, candidates = character(0), pre_trained = pre_trained, transform = TRUE, - transform_matrix = transform_matrix, - what = what){ + transform_matrix = transform_matrix){ - # create a new corpus - x <- quanteda::corpus_sample(x, size = quanteda::ndoc(x), replace = TRUE, by = groups) - # tokenize texts - corpus_toks <- quanteda::tokens(x, what = what) + # sample tokens with replacement + x <- quanteda::tokens_sample(x = x, size = table(groups), replace = TRUE, by = groups) # create document-feature matrix - corpus_dfm <- quanteda::dfm(corpus_toks, tolower = FALSE) + x_dfm <- quanteda::dfm(x, tolower = FALSE) # compute document-embedding matrix - corpus_dem <- dem(x = corpus_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE) + x_dem <- dem(x = x_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE) - # aggregate dems by group var - corpus_dem <- dem_group(x = corpus_dem, groups = corpus_dem@docvars$group) - corpus_dem <- corpus_dem[group_order,] # re-arrange groups + # aggregate dems by group + wvs <- dem_group(x = x_dem, groups = x_dem@docvars$group) # find nearest neighbors - result <- nns_ratio(x = corpus_dem, N = NULL, candidates = candidates, pre_trained = pre_trained) + result <- nns_ratio(x = wvs, N = NULL, numerator = numerator, candidates = candidates, pre_trained = pre_trained, verbose = FALSE) return(result) @@ -210,31 +201,26 @@ nns_ratio_boostrap <- function(x, # runs permutations nns_ratio_permute <- function(x, groups, - group_order, + numerator = NULL, candidates = character(0), pre_trained, transform = TRUE, - transform_matrix, - what = 'word'){ + transform_matrix){ - # shuffle the texts - x <- quanteda::corpus(sample(as.character(x)), docvars = data.frame('group' = groups)) - - # tokenize texts - corpus_toks <- quanteda::tokens(x, what = what) + # shuffle tokenized texts + docvars(x, 'group') <- sample(groups) # create document-feature matrix - corpus_dfm <- quanteda::dfm(corpus_toks, tolower = FALSE) + x_dfm <- quanteda::dfm(x, tolower = FALSE) # compute document-embedding matrix - corpus_dem <- dem(x = corpus_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE) + x_dem <- dem(x = x_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE) - # aggregate dems by group var - corpus_dem <- dem_group(x = corpus_dem, groups = corpus_dem@docvars$group) - corpus_dem <- corpus_dem[group_order,] # re-arrange groups + # aggregate dems by group + wvs <- dem_group(x = x_dem, groups = x_dem@docvars$group) # find nearest neighbors - result <- nns_ratio(x = corpus_dem, N = NULL, candidates = candidates, pre_trained = pre_trained) + result <- nns_ratio(x = wvs, N = NULL, numerator = numerator, candidates = candidates, pre_trained = pre_trained, verbose = FALSE) return(result) } diff --git a/R/nns.R b/R/nns.R index 260a529..7467551 100644 --- a/R/nns.R +++ b/R/nns.R @@ -83,7 +83,8 @@ nns <- function(x, N = 10, candidates = character(0), pre_trained, as_list = TRU if(is.null(rownames(x))) result$target <- NA # if !as_list return a list object with an item for each target data.frame - if(as_list) result <- lapply(unique(result$target), function(i) result[result$target == i,] %>% dplyr::mutate(target = as.character(target))) %>% setNames(unique(result$target)) + if(as_list && !is.null(rownames(x))) result <- lapply(unique(result$target), function(i) result[result$target == i,] %>% dplyr::mutate(target = as.character(target))) %>% setNames(unique(result$target)) + if(as_list && is.null(rownames(x))) message("although as_list = TRUE, will return a single tibble as there is only one word vector") return(result) } diff --git a/R/nns_ratio.R b/R/nns_ratio.R index e4da72a..ed78a14 100644 --- a/R/nns_ratio.R +++ b/R/nns_ratio.R @@ -3,12 +3,14 @@ #' #' @param x a [dem-class] or [fem-class] object or in general a matrix of embeddings #' @param N number of nearest neighbors to consider +#' @param numerator (character) defines which group is the nuemerator in the ratio #' Note: if nearest neighbors overlap, the resulting number of features #' will be fewer than 2*N. #' @param candidates character vector delimiting the features to consider as nearest neighbor candidates #' @param pre_trained a F x D matrix of numeric values corresponding to pretrained embeddings #' F = number of features and D = embedding dimensions. #' rownames(pre_trained) = set of features for which there is a pre-trained embedding +#' @param verbose provide information on which group is the numerator #' #' @return a `data.frame` with following columns: #' \item{`feature`}{(character) vector of features from the candidate set, @@ -46,12 +48,15 @@ #' immig_dem_party <- dem_group(immig_dem, groups = immig_dem@docvars$party) #' #' # find nearest neighbors -#' nns_ratio(x = immig_dem_party, pre_trained = glove_subset, candidates = character(0), N = 10) -nns_ratio <- function(x, N = 10, candidates = character(0), pre_trained){ +#' nns_ratio(x = immig_dem_party, N = 10, numerator = "R", candidates = character(0), pre_trained = glove_subset, verbose = TRUE) +nns_ratio <- function(x, N = 10, numerator = NULL, candidates = character(0), pre_trained, verbose = TRUE){ # check if(nrow(x)!=2) stop('nns_ratio can only be applied to a pair of embeddings i.e. nrow(x) must equal 2') + # re-arrange if numerator is defined + if(!is.null(numerator) && rownames(x)[1]!=numerator) x <- x[c(rownames(x)[2],rownames(x)[1]),] + # get nns nnsdf1 <- nns(x = x[1,], N = Inf, candidates = candidates, pre_trained = pre_trained, as_list = FALSE) nnsdf2 <- nns(x = x[2,], N = Inf, candidates = candidates, pre_trained = pre_trained, as_list = FALSE) @@ -64,5 +69,9 @@ nns_ratio <- function(x, N = 10, candidates = character(0), pre_trained){ nnsdf2 <- nnsdf2 %>% dplyr::filter(feature %in% union_nns) %>% dplyr::arrange(feature) result <- data.frame(feature = nnsdf1$feature, value = nnsdf1$value/nnsdf2$value) %>% dplyr::arrange(-value) + # add an attribute specifying which group is the numerator and communicated this to user + attr(result, "numerator") <- rownames(x)[1] + if(verbose) cat("NOTE: values refer to the ratio", paste0(rownames(x)[1], "/", rownames(x)[2], ".")) + return(result) } diff --git a/man/get_cos_sim.Rd b/man/get_cos_sim.Rd index 225db9c..ba1e1b8 100644 --- a/man/get_cos_sim.Rd +++ b/man/get_cos_sim.Rd @@ -14,13 +14,12 @@ get_cos_sim( transform_matrix, bootstrap = TRUE, num_bootstraps = 10, - what = "word", as_list = TRUE, verbose = TRUE ) } \arguments{ -\item{x}{a (quanteda) corpus or character vector} +\item{x}{a (quanteda) tokens object} \item{groups}{a grouping variable} @@ -42,9 +41,6 @@ the highest average over all bootstrap samples.} \item{num_bootstraps}{(integer) number of bootstraps to use} -\item{what}{character; which quanteda tokenizer to use. You will rarely want to change this. -For Chinese texts you may want to set what = 'fastestword'.} - \item{as_list}{(logical) if FALSE all results are combined into a single data.frame If TRUE, a list of data.frames is returned with one data.frame per feature.} @@ -74,8 +70,10 @@ library(quanteda) immig_corpus <- corpus_context(x = cr_sample_corpus, pattern = "immigration", window = 6L, verbose = TRUE) -get_cos_sim(x = immig_corpus, -groups = docvars(immig_corpus, 'party'), +immig_toks <- tokens(immig_corpus) + +get_cos_sim(x = immig_toks, +groups = docvars(immig_toks, "party"), features = c("reform", "enforce"), pre_trained = glove_subset, transform = TRUE, diff --git a/man/get_nns.Rd b/man/get_nns.Rd index e15bdff..a70ea7e 100644 --- a/man/get_nns.Rd +++ b/man/get_nns.Rd @@ -15,12 +15,11 @@ get_nns( transform_matrix, bootstrap = TRUE, num_bootstraps = 10, - what = "word", as_list = TRUE ) } \arguments{ -\item{x}{a (quanteda) corpus or character vector} +\item{x}{a (quanteda) tokens object} \item{N}{(numeric) number of nearest neighbors to return} @@ -47,9 +46,6 @@ the highest average over all bootstrap samples.} \item{num_bootstraps}{(integer) number of bootstraps to use} -\item{what}{character; which quanteda tokenizer to use. You will rarely want to change this. -For Chinese texts you may want to set what = 'fastestword'.} - \item{as_list}{(logical) if FALSE all results are combined into a single data.frame If TRUE, a list of data.frames is returned with one data.frame per target.} } @@ -76,8 +72,10 @@ library(quanteda) immig_corpus <- corpus_context(x = cr_sample_corpus, pattern = "immigration", window = 6L, verbose = TRUE) -get_nns(x = immig_corpus, N = 10, -groups = docvars(immig_corpus, 'party'), +immig_toks <- tokens(immig_corpus) + +get_nns(x = immig_toks, N = 10, +groups = docvars(immig_toks, "party"), candidates = character(0), pre_trained = glove_subset, transform = TRUE, diff --git a/man/get_nns_ratio.Rd b/man/get_nns_ratio.Rd index a1ff55f..1234a81 100644 --- a/man/get_nns_ratio.Rd +++ b/man/get_nns_ratio.Rd @@ -8,7 +8,7 @@ over the union of their respective N nearest neighbors.} get_nns_ratio( x, N = 10, - groups = NULL, + groups, numerator = NULL, candidates = character(0), pre_trained, @@ -18,19 +18,17 @@ get_nns_ratio( num_bootstraps = 10, permute = TRUE, num_permutations = 100, - what = "word" + verbose = TRUE ) } \arguments{ -\item{x}{a (quanteda) corpus or character vector} +\item{x}{a (quanteda) tokens object} -\item{N}{number of nearest neighbors to consider -Note: if nearest neighbors overlap, the resulting number of features -will be fewer than 2*N.} +\item{N}{number of nearest neighbors to consider} \item{groups}{a character or factor variable equal in length to the number of documents} -\item{numerator}{(character)} +\item{numerator}{(character) defines which group is the nuemerator in the ratio.} \item{candidates}{character vector delimiting the features to consider as nearest neighbor candidates} @@ -54,8 +52,7 @@ the highest average over all bootstrap samples.} \item{num_permutations}{(numeric) number of permutations to use} -\item{what}{character; which quanteda tokenizer to use. You will rarely want to change this. -For Chinese texts you may want to set what = 'fastestword'.} +\item{verbose}{provide information on which group is the numerator} } \value{ a \code{data.frame} with following columns: @@ -82,12 +79,17 @@ pattern = "immigration", window = 6L, verbose = TRUE) +# identify features in local vocab that overlap with pre-trained embeddings +# these are used to define candidate nearest neighbors below (optional) local_vocab <- get_local_vocab(as.character(immig_corpus), pre_trained = glove_subset) +# tokenize +immig_toks <- tokens(immig_corpus) + set.seed(42L) -get_nns_ratio(x = immig_corpus, +temp <- get_nns_ratio(x = immig_toks, N = 20, -groups = docvars(immig_corpus, 'party'), +groups = docvars(immig_toks, 'party'), numerator = "R", candidates = local_vocab, pre_trained = glove_subset, @@ -96,7 +98,7 @@ transform_matrix = khodakA, bootstrap = TRUE, num_bootstraps = 10, permute = TRUE, -num_permutations = 10) - +num_permutations = 10, +verbose = TRUE) } \keyword{get_nns_ratio} diff --git a/man/nns_ratio.Rd b/man/nns_ratio.Rd index 263cdee..a5f74df 100644 --- a/man/nns_ratio.Rd +++ b/man/nns_ratio.Rd @@ -5,12 +5,21 @@ \title{Computes the ratio of cosine similarities for two embeddings over the union of their respective top N nearest neighbors.} \usage{ -nns_ratio(x, N = 10, candidates = character(0), pre_trained) +nns_ratio( + x, + N = 10, + numerator = NULL, + candidates = character(0), + pre_trained, + verbose = TRUE +) } \arguments{ \item{x}{a \linkS4class{dem} or \linkS4class{fem} object or in general a matrix of embeddings} -\item{N}{number of nearest neighbors to consider +\item{N}{number of nearest neighbors to consider} + +\item{numerator}{(character) defines which group is the nuemerator in the ratio Note: if nearest neighbors overlap, the resulting number of features will be fewer than 2*N.} @@ -19,6 +28,8 @@ will be fewer than 2*N.} \item{pre_trained}{a F x D matrix of numeric values corresponding to pretrained embeddings F = number of features and D = embedding dimensions. rownames(pre_trained) = set of features for which there is a pre-trained embedding} + +\item{verbose}{provide information on which group is the numerator} } \value{ a \code{data.frame} with following columns: @@ -58,6 +69,6 @@ verbose = FALSE) immig_dem_party <- dem_group(immig_dem, groups = immig_dem@docvars$party) # find nearest neighbors -nns_ratio(x = immig_dem_party, pre_trained = glove_subset, candidates = character(0), N = 10) +nns_ratio(x = immig_dem_party, N = 10, numerator = "R", candidates = character(0), pre_trained = glove_subset, verbose = TRUE) } \keyword{nns_ratio}