conText 1.1.0 beta release
Bug fixes:
- fixed an issue with the pattern prefix
- tokens_context can now be used without docvars

New functions:
- get_ncs
- ncs

Other:
- added NOMINATE to sample data
- conText can now take continuous covariates (see the sketch below)
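
A minimal sketch of the new continuous-covariate support, using the nominate_dim1 docvar added to cr_sample_corpus in this release. The call mirrors the updated conText() example further down; treat it as an illustration, not package documentation.

library(quanteda)
library(conText)

# tokenize the sample corpus shipped with the package
toks <- tokens(cr_sample_corpus)

# regress ALC embeddings of "immigration" on a continuous covariate;
# numeric covariates now enter the regression as-is (no dummification)
set.seed(2021L)
model_cont <- conText(formula = immigration ~ nominate_dim1,
                      data = toks,
                      pre_trained = cr_glove_subset,
                      transform = TRUE, transform_matrix = cr_transform,
                      bootstrap = TRUE, num_bootstraps = 100,
                      permute = TRUE, num_permutations = 100,
                      window = 6L, verbose = FALSE)
rownames(model_cont)
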
prodriguezsosa committed Mar 27, 2022
1 parent 539cea2 commit 1514bbc
Showing 34 changed files with 1,263 additions and 411 deletions.
Binary file modified .DS_Store
Binary file not shown.
8 changes: 6 additions & 2 deletions DESCRIPTION
@@ -1,5 +1,5 @@
Package: conText
Version: 1.0.0
Version: 1.1.0
Title: 'a la Carte' on Text (ConText) Embedding Regression
Description: A fast, flexible and transparent framework to estimate context-specific word and short document embeddings using the 'a la carte'
embeddings approach developed by Khodak et al. (2018) <arXiv:1805.05388> and evaluate hypotheses about covariate effects on embeddings using
@@ -21,6 +21,7 @@ Imports:
reshape2 (>= 1.4.4),
fastDummies (>= 1.6.3),
stringr (>= 1.4.0),
tidyr (>= 1.1.3),
ggplot2,
methods
URL: https://github.com/prodriguezsosa/EmbeddingRegression
@@ -33,5 +34,8 @@ Language: en-US
RoxygenNote: 7.1.1
Roxygen: list(markdown = TRUE)
Suggests:
SnowballC (>= 0.7.0),
hunspell,
knitr,
rmarkdown
rmarkdown,
formatR
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -16,9 +16,11 @@ export(find_nns)
export(get_context)
export(get_cos_sim)
export(get_local_vocab)
export(get_ncs)
export(get_nns)
export(get_nns_ratio)
export(get_seq_cos_sim)
export(ncs)
export(nns)
export(nns_ratio)
export(permute_contrast)
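
The newly exported get_ncs()/ncs() pair (nearest contexts) does not appear elsewhere in this diff. A plausible call, with the signature assumed by analogy with the existing get_nns()/nns() pair — the argument names here are guesses, so check the function docs:

library(quanteda)
library(conText)

toks <- tokens(cr_sample_corpus)
immig_toks <- tokens_context(x = toks, pattern = "immigr*", window = 6L)

# nearest contexts by party -- hypothetical signature, assumed to mirror get_nns()
immig_ncs <- get_ncs(x = immig_toks, N = 5,
                     groups = docvars(immig_toks, "party"),
                     pre_trained = cr_glove_subset,
                     transform = TRUE, transform_matrix = cr_transform,
                     bootstrap = TRUE, num_bootstraps = 10,
                     as_list = TRUE)
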
69 changes: 41 additions & 28 deletions R/conText.R
@@ -4,19 +4,21 @@
#' intervals and a permutation test for inference (see https://github.com/prodriguezsosa/conText for details.)
#'
#' @param formula a symbolic description of the model to be fitted with a target word as a DV e.g.
#' `immigrant ~ party + gender`. To use a phrase as a DV, place it quotations e.g.
#' `immigrant ~ party + gender`. To use a phrase as a DV, place it in quotations e.g.
#' `"immigrant refugees" ~ party + gender`. To use all covariates included in the data,
#' you can use `.` on the RHS, e.g. `immigrant ~ .`. If you wish to treat the full document as your DV, rather
#' than a single target word, use `.` on the LHS e.g. `. ~ party + gender`.
#' than a single target word, use `.` on the LHS e.g. `. ~ party + gender`. If you wish to use all covariates
#' on the RHS use `immigrant ~ .`. Any `character` or `factor` covariates will automatically be converted
#' to a set of binary (`0/1`s) indicator variables for each group, leaving the first level out of the regression.
#' @param data a quanteda `tokens-class` object with the necessary document variables. Covariates must be
#' either binary indicator variables or "transformable" into binary indicator variables. conText will automatically
#' transform any non-indicator variables into binary indicator variables (multiple if more than 2 classes),
#' leaving out a "base" category.
#' @inheritParams dem
#' @param bootstrap (logical) if TRUE, use bootstrapping -- sample from texts with replacement and
#' re-run regression on each sample. Required to get std. errors.
#' @param num_bootstraps (numeric) number of bootstraps to use
#' @param stratify (logical) if TRUE, stratify by covariates when bootstrapping
#' @param num_bootstraps (numeric) number of bootstraps to use.
#' @param stratify (logical) if TRUE, stratify by discrete covariates when bootstrapping.
#' @param permute (logical) if TRUE, compute empirical p-values using permutation test
#' @param num_permutations (numeric) number of permutations to use
#' @inheritParams tokens_context
@@ -44,22 +46,19 @@
#' # tokenize corpus
#' toks <- tokens(cr_sample_corpus)
#'
#' # build a tokenized corpus of contexts surrounding a target term
#' immig_toks <- tokens_context(x = toks, pattern = "immigr*", window = 6L)
#'
#' ## given the target word "immigration"
#' set.seed(2021L)
#' model1 <- conText(formula = immigration ~ party + gender,
#' data = toks,
#' pre_trained = cr_glove_subset,
#' transform = TRUE, transform_matrix = cr_transform,
#' bootstrap = TRUE, num_bootstraps = 10,
#' stratify = TRUE,
#' stratify = FALSE,
#' permute = TRUE, num_permutations = 100,
#' window = 6, case_insensitive = TRUE,
#' verbose = FALSE)
#'
#' # notice, non-binary covariates are automatically "dummified"
#' # notice, character/factor covariates are automatically "dummified"
#' rownames(model1)
#'
#' # the beta coefficient 'partyR' in this case corresponds to the alc embedding
@@ -68,52 +67,51 @@
#' # (normed) coefficient table
#' model1@normed_cofficients
#'
conText <- function(formula, data, pre_trained, transform = TRUE, transform_matrix, bootstrap = TRUE, num_bootstraps = 20, stratify = TRUE, permute = TRUE, num_permutations = 100, window = 6L, valuetype = c("glob", "regex", "fixed"), case_insensitive = TRUE, hard_cut = FALSE, verbose = TRUE){
conText <- function(formula, data, pre_trained, transform = TRUE, transform_matrix, bootstrap = TRUE, num_bootstraps = 100, stratify = FALSE, permute = TRUE, num_permutations = 100, window = 6L, valuetype = c("glob", "regex", "fixed"), case_insensitive = TRUE, hard_cut = FALSE, verbose = TRUE){

# initial checks
if(class(data)[1] != "tokens") stop("data must be of class tokens")
if(!transform & !is.null(transform_matrix)) warning("Warning: transform = FALSE means transform_matrix argument was ignored. If that was not your intention, use transform = TRUE.")
if(class(data)[1] != "tokens") stop("data must be of class tokens", call. = FALSE)
if(!transform && !is.null(transform_matrix)) warning('Warning: transform = FALSE means transform_matrix argument was ignored. If that was not your intention, use transform = TRUE.', call. = FALSE)
if(any(grepl("factor\\(|\\)", formula))) stop('It seems you are using factor() in "formula" to create a factor variable. \n Please create it directly in "data" and re-run conText.', call. = FALSE) # pre-empt users using lm-type notation

# extract dependent variable
target <- as.character(formula[[2]])
if(length(target) > 1) target <- target[2:length(target)]

# mirror lm convention: if DV is "." then the full text is embedded, otherwise find and embed the context around the DV
if(target != "."){

# create a corpus of contexts
toks <- tokens_context(x = data, pattern = target, window = window, valuetype = valuetype, case_insensitive = case_insensitive, hard_cut = hard_cut, verbose = verbose)
docvars <- quanteda::docvars(toks) %>% dplyr::select(-pattern)

}else{
toks <- data
docvars <- quanteda::docvars(toks)
}

#----------------------
# COVARIATES
#----------------------

# extract covariate names
docvars <- quanteda::docvars(toks)
if(formula[[3]] == "."){covariates <- names(docvars)}else{ # follows lm convention, if DV = ., regress on all variables in data
covariates <- setdiff(stringr::str_squish(unlist(strsplit(as.character(formula[[3]]), '+', fixed = TRUE))), '') # to allow for phrase DVs
if(any(!(covariates %in% names(docvars))))stop("one or more of the covariates could not be found in the data.")
covs_not_in_data <- covariates[!(covariates %in% names(docvars))]
if(length(covs_not_in_data) > 0) stop("the following covariates could not be found in the data: ", paste0(covs_not_in_data, collapse = ", "))
}

# select covariates
cov_vars <- docvars %>% dplyr::select(dplyr::all_of(covariates))

# check covariates are binary dummy variables
indicator_check <- apply(cov_vars, 2, function(i) all((class(i) %in% c('integer', 'numeric')) & (length(setdiff(i, c(0,1))) == 0)))

# if there are non-indicator variables
if(!all(indicator_check)){
non_indicator_vars <- names(which(indicator_check == FALSE))

# check whether they can be "dummified" (i.e. must be character or factor variables)
class_check <- sapply(dplyr::tibble(cov_vars[,non_indicator_vars]), function(i) is.character(i) | is.factor(i))
# check which covariates are binary dummy variables
numeric_vars <- c(names(which(sapply(cov_vars, is.numeric))), names(which(sapply(cov_vars, is.integer))))
non_numeric_vars <- setdiff(covariates, numeric_vars)

# if they can be "dummified", do so (see: https://cran.r-project.org/web/packages/fastDummies/fastDummies.pdf)
if(!all(class_check))stop("covariates must be either a binary indicator variable (0/1s), a character variable or a factor variable")
cov_vars <- fastDummies::dummy_cols(cov_vars, select_columns = non_indicator_vars, remove_first_dummy = TRUE, remove_selected_columns = TRUE, ignore_na = TRUE)
# dummify non-numeric/integer variables
# see: https://cran.r-project.org/web/packages/fastDummies/fastDummies.pdf
if(length(non_numeric_vars)>0){
cov_vars <- fastDummies::dummy_cols(cov_vars, select_columns = non_numeric_vars, remove_first_dummy = TRUE, remove_selected_columns = TRUE, ignore_na = TRUE)
}

# add intercept
@@ -256,10 +254,26 @@ bootstrap_ols <- function(Y = NULL, X = NULL, stratify = NULL){
X_bs <- cbind(obs = 1:nrow(X), X)

# sample observations with replacement
if(stratify) X_bs <- X_bs %>% dplyr::group_by_at(setdiff(names(X), "(Intercept)")) %>% dplyr::sample_n(size = dplyr::n(), replace = TRUE) %>% dplyr::ungroup() else{
if (stratify) {

# identify discrete covariates to stratify over
discrete_vars <- setdiff(colnames(X_bs)[sapply(X_bs, is.numeric)], c("obs", "(Intercept)"))

# sample with stratification if there are discrete variables to stratify over
if(length(discrete_vars) > 0) X_bs <- X_bs %>% dplyr::group_by_at(discrete_vars) %>% dplyr::sample_n(size = dplyr::n(), replace = TRUE) %>% dplyr::ungroup()
else{
warning('no discrete covariate to stratify over. Will proceed without stratifying.', call. = FALSE)
X_bs <- dplyr::sample_n(X_bs, size = nrow(X_bs), replace = TRUE)
}

} else{

X_bs <- dplyr::sample_n(X_bs, size = nrow(X_bs), replace = TRUE)

}

#X_bs <- dplyr::sample_n(X_bs, size = nrow(X_bs), replace = TRUE)

# subset Y to sampled observations
Y_bs <- Y[X_bs$obs,]

@@ -299,4 +313,3 @@ run_ols <- function(Y = NULL, X = NULL){
return(list('betas' = betas, 'normed_betas' = normed_betas))

}
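
The stratified branch of bootstrap_ols() above resamples rows within each discrete covariate group, keeping group sizes fixed. A standalone toy illustration of that resampling step (not package code):

library(dplyr)

# toy design matrix: an observation id and one discrete (dummy) covariate
X_bs <- data.frame(obs = 1:6, partyR = c(0, 0, 0, 1, 1, 1))

# resample within each value of partyR, preserving group sizes
set.seed(42L)
X_strat <- X_bs %>%
  dplyr::group_by(partyR) %>%
  dplyr::sample_n(size = dplyr::n(), replace = TRUE) %>%
  dplyr::ungroup()

table(X_strat$partyR)  # same 3/3 split as the original
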

40 changes: 33 additions & 7 deletions R/cos_sim.R
@@ -5,14 +5,19 @@
#' F = number of features and D = embedding dimensions.
#' rownames(pre_trained) = set of features for which there is a pre-trained embedding.
#' @param features (character) features of interest.
#' @param stem (logical) - If TRUE, both `features` and `rownames(pre_trained)`
#' are stemmed and average cosine similarities are reported.
#' We recommend you remove misspelled words from `pre_trained` as these can
#' significantly influence the average.
#' @param as_list (logical) if FALSE all results are combined into a single data.frame
#' If TRUE, a list of data.frames is returned with one data.frame per feature.
#'
#' @return a `data.frame` or list of data.frames (one for each target)
#' with the following columns:
#' \describe{
#' \item{`target`}{ (character) rownames of `x`,
#' the labels of the ALC embeddings.}
#' the labels of the ALC embeddings.
#' NA if is.null(rownames(x)).}
#' \item{`feature`}{(character) feature terms defined in
#' the `features` argument.}
#' \item{`value`}{(numeric) cosine similarity between `x`
@@ -43,19 +48,40 @@
#' immig_wv_party <- dem_group(immig_dem, groups = immig_dem@docvars$party)
#'
#' # compute the cosine similarity between each party's embedding and a specific set of features
#' cos_sim(immig_wv_party, pre_trained = cr_glove_subset,
#' cos_sim(x = immig_wv_party, pre_trained = cr_glove_subset,
#' features = c('reform', 'enforcement'), as_list = FALSE)
cos_sim <- function(x, pre_trained, features = NULL, as_list = TRUE){
cos_sim <- function(x, pre_trained, features = NULL, stem = FALSE, as_list = TRUE){

# for single numeric vectors
if(is.null(dim(x)) && length(x) == dim(pre_trained)[2]) x <- matrix(x, nrow = 1)

# check features are in pre-trained embeddings
feature_check <- features %in% rownames(pre_trained)
if(!all(feature_check)) stop('the following features do not appear to have an embedding in the set of pre-trained embeddings provided: ', paste(features[which(!feature_check)], collapse = ', '))
if(stem){
if (requireNamespace("SnowballC", quietly = TRUE)) {
features <- SnowballC::wordStem(features)
pre_trained_feats <- SnowballC::wordStem(rownames(pre_trained))
} else {
stem <- FALSE
pre_trained_feats <- rownames(pre_trained)
warning('"SnowballC (>= 0.7.0)" package must be installed to use stemmming option. Will proceed without stemming.', call. = FALSE)
}
} else pre_trained_feats <- rownames(pre_trained)
feature_check <- features %in% pre_trained_feats

# check if any of the features are present, if none, stop
if(!any(feature_check)) stop('none of the features appear to have an embedding in the set of pre-trained embeddings provided, please select other features.', call. = FALSE)
if(!all(feature_check)) warning('the following features do not appear to have an embedding in the set of pre-trained embeddings provided: ', paste(features[which(!feature_check)], collapse = ', '))

# compute cosine similarity
cos_sim <- text2vec::sim2(x, matrix(pre_trained[features,], ncol = ncol(pre_trained), dimnames = list(features)), method = 'cosine', norm = 'l2')
cos_sim <- text2vec::sim2(x, pre_trained, method = 'cosine', norm = 'l2')

# convert to dataframe
result <- reshape2::melt(as.matrix(cos_sim)) %>% setNames(c('target', 'feature', 'value'))
cos_sim <- reshape2::melt(as.matrix(cos_sim)) %>% setNames(c('target', 'feature', 'value'))
if(is.null(rownames(x))) cos_sim$target <- NA

# stemming
if(stem) result <- cos_sim %>% dplyr::mutate(feature = SnowballC::wordStem(feature)) %>% dplyr::group_by(target, feature) %>% dplyr::summarise(dplyr::across(where(is.numeric), mean), .groups = "drop") %>% dplyr::ungroup() %>% dplyr::filter(feature %in% features)
else result <- cos_sim %>% dplyr::filter(feature %in% features)

# if !as_list return a list object with an item for each feature data.frame
if(as_list) result <- lapply(unique(result$feature), function(i) result[result$feature == i,] %>% dplyr::mutate(feature = as.character(feature))) %>% setNames(unique(result$feature))
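
Building on the documented example above, the new stem argument averages similarities within each stem. A sketch assuming the immig_wv_party object from the package examples and an installed SnowballC:

# cosine similarity with stemming: features and pre-trained rownames are
# stemmed, and similarities are averaged within each stem
cos_sim(x = immig_wv_party, pre_trained = cr_glove_subset,
        features = c("reform", "enforcement"), stem = TRUE, as_list = FALSE)
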
3 changes: 2 additions & 1 deletion R/data.R
@@ -31,12 +31,13 @@
#' containing the regular expression "immig*". Then 100 docs from each party-gender pair
#' are randomly sampled. For full data and pre-processing file, see:
#' https://www.dropbox.com/sh/jsyrag7opfo7l7i/AAB1z7tumLuKihGu2-FDmhmKa?dl=0
#' For NOMINATE scores, see: https://voteview.com/data
#'
#' @format A quanteda corpus with 200 documents and 4 docvars:
#' \describe{
#' \item{party}{party of speaker, (D)emocrat or (R)epublican}
#' \item{gender}{gender of speaker, (F)emale or (M)ale}
#' \item{session_id}{id of Congress session in which speech was given}
#' \item{nominate_dim1}{dimension 1 of the nominate score}
#' ...
#' }
#' @source \url{https://data.stanford.edu/congress_text}
2 changes: 1 addition & 1 deletion R/dem.R
@@ -84,7 +84,7 @@ dem <- function(x, pre_trained, transform = TRUE, transform_matrix, verbose = TR
docvars = quanteda::docvars(x)[included,,drop=FALSE],
features = overlapping_features,
Dimnames = list(
docs = rownames(x)[included],
docs = as.character(quanteda::docid(x)[included]),
columns = NULL))


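
With the change above, dem rownames track quanteda document ids rather than positional rownames, so they survive subsetting of the underlying tokens. A sketch assuming immig_toks built with tokens_context() as in the earlier examples:

# build a document-embedding matrix; rownames now come from quanteda docids
immig_dfm <- quanteda::dfm(immig_toks)
immig_dem <- dem(x = immig_dfm, pre_trained = cr_glove_subset,
                 transform = TRUE, transform_matrix = cr_transform,
                 verbose = FALSE)
head(rownames(immig_dem))
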
20 changes: 14 additions & 6 deletions R/find_nns.R
@@ -11,6 +11,10 @@
#' \item{`"l2"`}{cosine similarity}
#' \item{`"none"`}{inner product}
#' }
#' @param stem (logical) - whether to stem candidates when evaluating nns. Default is FALSE.
#' If TRUE, candidate stems are ranked by their average cosine similarity to the target.
#' We recommend you remove misspelled words from the candidate set `candidates` as these can
#' significantly influence the average.
#'
#' @return (character) vector of nearest neighbors to target
#' @export
@@ -20,10 +24,14 @@
# find nearest neighbors
#' find_nns(target_embedding = cr_glove_subset['immigration',],
#' pre_trained = cr_glove_subset, N = 5,
#' candidates = NULL, norm = "l2")
find_nns <- function(target_embedding, pre_trained, N = 5, candidates = NULL, norm = "l2"){
if(is.null(candidates)) cos_sim <- text2vec::sim2(x = pre_trained, y = matrix(target_embedding, nrow = 1), method = "cosine", norm = norm)
if(!is.null(candidates)) cos_sim <- text2vec::sim2(x = pre_trained[candidates,], y = matrix(target_embedding, nrow = 1), method = "cosine", norm = norm)
nn <- cos_sim[order(-cos_sim),]
return(names(nn)[1:N])
#' candidates = NULL, norm = "l2", stem = FALSE)
find_nns <- function(target_embedding, pre_trained, N = 5, candidates = NULL, norm = "l2", stem = FALSE){
if(is.null(candidates)) cos_sim <- text2vec::sim2(x = pre_trained, y = matrix(target_embedding, nrow = 1), method = "cosine", norm = norm)[,1]
if(!is.null(candidates)) cos_sim <- text2vec::sim2(x = pre_trained[candidates,], y = matrix(target_embedding, nrow = 1), method = "cosine", norm = norm)[,1]
nn_df <- data.frame(token = names(cos_sim), value = unname(cos_sim)) %>% dplyr::arrange(-value)
if(stem){
if (requireNamespace("SnowballC", quietly = TRUE)) nn_df <- nn_df %>% dplyr::mutate(token = SnowballC::wordStem(token)) %>% dplyr::group_by(token) %>% dplyr::summarize(value = mean(value)) %>% dplyr::arrange(-value) %>% dplyr::ungroup()
else warning('"SnowballC (>= 0.7.0)" package must be installed to use the stemming option. Will proceed without stemming.', call. = FALSE)
}
return(nn_df$token[1:N])
}
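
The same documented example with stemming enabled; nearest neighbors are then ranked by each stem's average cosine similarity to the target (falls back to unstemmed ranking with a warning if SnowballC is missing):

find_nns(target_embedding = cr_glove_subset["immigration", ],
         pre_trained = cr_glove_subset, N = 5,
         candidates = NULL, norm = "l2", stem = TRUE)
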
13 changes: 8 additions & 5 deletions R/get_cos_sim.R
@@ -54,6 +54,7 @@
#' transform_matrix = cr_transform,
#' bootstrap = TRUE,
#' num_bootstraps = 10,
#' stem = TRUE,
#' as_list = FALSE)
get_cos_sim <- function(x,
groups = NULL,
@@ -63,11 +64,11 @@ get_cos_sim <- function(x,
transform_matrix,
bootstrap = TRUE,
num_bootstraps = 10,
as_list = TRUE,
verbose = TRUE) {
stem = FALSE,
as_list = TRUE) {

# initial checks
if(class(x)[1] != "tokens") stop("data must be of class tokens")
if(class(x)[1] != "tokens") stop("data must be of class tokens", call. = FALSE)

# add grouping variable to docvars
if(!is.null(groups)) quanteda::docvars(x) <- NULL; quanteda::docvars(x, "group") <- groups
@@ -80,6 +81,7 @@
pre_trained = pre_trained,
transform = transform,
transform_matrix = transform_matrix,
stem = stem,
as_list = FALSE),
simplify = FALSE)
result <- do.call(rbind, cossimdf_bs) %>%
@@ -105,7 +107,7 @@
}

# compute cosine similarity
result <- cos_sim(x = wvs, pre_trained = pre_trained, features = features)
result <- cos_sim(x = wvs, pre_trained = pre_trained, features = features, stem = stem, as_list = FALSE)
}

# if !as_list return a list object with an item for each feature data.frame
@@ -121,6 +123,7 @@ cos_sim_boostrap <- function(x,
pre_trained,
transform = TRUE,
transform_matrix,
stem = stem,
as_list = FALSE){

# sample tokens with replacement
@@ -144,7 +147,7 @@
}

# compute cosine similarity
result <- cos_sim(x = wvs, pre_trained = pre_trained, features = features, as_list = FALSE)
result <- cos_sim(x = wvs, pre_trained = pre_trained, features = features, stem = stem, as_list = FALSE)

return(result)

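
A fuller sketch of the updated get_cos_sim() call, combining the partially shown example above with the new stem argument. It assumes immig_toks built with tokens_context() as in the other examples, plus arguments (features, pre_trained) hidden in the collapsed hunks; note the updated signature drops verbose:

set.seed(2021L)
get_cos_sim(x = immig_toks,
            groups = quanteda::docvars(immig_toks, "party"),
            features = c("reform", "enforcement"),
            pre_trained = cr_glove_subset,
            transform = TRUE, transform_matrix = cr_transform,
            bootstrap = TRUE, num_bootstraps = 10,
            stem = TRUE, as_list = FALSE)
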
