conText 1.1.0 beta release
Bug fixes:
- fixed an issue with the pattern prefix
- tokens_context can now be used without docvars

New functions:
- get_ncs
- ncs

Other:
- added NOMINATE to sample data
- conText can now take continuous covariates (see the sketch below)
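
A minimal sketch of the new continuous-covariate support, using the nominate_dim1 docvar added to cr_sample_corpus in this release. The call mirrors the updated conText() example further down; treat it as an illustration, not package documentation.

library(quanteda)
library(conText)

# tokenize the sample corpus shipped with the package
toks <- tokens(cr_sample_corpus)

# regress ALC embeddings of "immigration" on a continuous covariate;
# numeric covariates now enter the regression as-is (no dummification)
set.seed(2021L)
model_cont <- conText(formula = immigration ~ nominate_dim1,
                      data = toks,
                      pre_trained = cr_glove_subset,
                      transform = TRUE, transform_matrix = cr_transform,
                      bootstrap = TRUE, num_bootstraps = 100,
                      permute = TRUE, num_permutations = 100,
                      window = 6L, verbose = FALSE)
rownames(model_cont)
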
prodriguezsosa committed Mar 27, 2022
1 parent 539cea2 commit 1514bbc
Showing 34 changed files with 1,263 additions and 411 deletions.
Binary file modified .DS_Store
Binary file not shown.
8 changes: 6 additions & 2 deletions DESCRIPTION
@@ -1,5 +1,5 @@
Package: conText
Version: 1.0.0
Version: 1.1.0
Title: 'a la Carte' on Text (ConText) Embedding Regression
Description: A fast, flexible and transparent framework to estimate context-specific word and short document embeddings using the 'a la carte'
embeddings approach developed by Khodak et al. (2018) <arXiv:1805.05388> and evaluate hypotheses about covariate effects on embeddings using
@@ -21,6 +21,7 @@ Imports:
reshape2 (>= 1.4.4),
fastDummies (>= 1.6.3),
stringr (>= 1.4.0),
tidyr (>= 1.1.3),
ggplot2,
methods
URL: https://github.com/prodriguezsosa/EmbeddingRegression
@@ -33,5 +34,8 @@ Language: en-US
RoxygenNote: 7.1.1
Roxygen: list(markdown = TRUE)
Suggests:
SnowballC (>= 0.7.0),
hunspell,
knitr,
rmarkdown
rmarkdown,
formatR
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -16,9 +16,11 @@ export(find_nns)
export(get_context)
export(get_cos_sim)
export(get_local_vocab)
export(get_ncs)
export(get_nns)
export(get_nns_ratio)
export(get_seq_cos_sim)
export(ncs)
export(nns)
export(nns_ratio)
export(permute_contrast)
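
The newly exported get_ncs()/ncs() pair (nearest contexts) does not appear elsewhere in this diff. A plausible call, with the signature assumed by analogy with the existing get_nns()/nns() pair — the argument names here are guesses, so check the function docs:

library(quanteda)
library(conText)

toks <- tokens(cr_sample_corpus)
immig_toks <- tokens_context(x = toks, pattern = "immigr*", window = 6L)

# nearest contexts by party -- hypothetical signature, assumed to mirror get_nns()
immig_ncs <- get_ncs(x = immig_toks, N = 5,
                     groups = docvars(immig_toks, "party"),
                     pre_trained = cr_glove_subset,
                     transform = TRUE, transform_matrix = cr_transform,
                     bootstrap = TRUE, num_bootstraps = 10,
                     as_list = TRUE)
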
69 changes: 41 additions & 28 deletions R/conText.R
@@ -4,19 +4,21 @@
#' intervals and a permutation test for inference (see https://github.com/prodriguezsosa/conText for details.)
#'
#' @param formula a symbolic description of the model to be fitted with a target word as a DV e.g.
#' `immigrant ~ party + gender`. To use a phrase as a DV, place it quotations e.g.
#' `immigrant ~ party + gender`. To use a phrase as a DV, place it in quotations e.g.
#' `"immigrant refugees" ~ party + gender`. To use all covariates included in the data,
#' you can use `.` on the RHS, e.g. `immigrant ~ .`. If you wish to treat the full document as your DV, rather
#' than a single target word, use `.` on the LHS e.g. `. ~ party + gender`.
#' than a single target word, use `.` on the LHS e.g. `. ~ party + gender`. If you wish to use all covariates
#' on the RHS use `immigrant ~ .`. Any `character` or `factor` covariates will automatically be converted
#' to a set of binary (`0/1`s) indicator variables for each group, leaving the first level out of the regression.
#' @param data a quanteda `tokens-class` object with the necessary document variables. Covariates must be
#' either binary indicator variables or "transformable" into binary indicator variables. conText will automatically
#' transform any non-indicator variables into binary indicator variables (multiple if more than 2 classes),
#' leaving out a "base" category.
#' @inheritParams dem
#' @param bootstrap (logical) if TRUE, use bootstrapping -- sample from texts with replacement and
#' re-run regression on each sample. Required to get std. errors.
#' @param num_bootstraps (numeric) number of bootstraps to use
#' @param stratify (logical) if TRUE, stratify by covariates when bootstrapping
#' @param num_bootstraps (numeric) number of bootstraps to use.
#' @param stratify (logical) if TRUE, stratify by discrete covariates when bootstrapping.
#' @param permute (logical) if TRUE, compute empirical p-values using permutation test
#' @param num_permutations (numeric) number of permutations to use
#' @inheritParams tokens_context
@@ -44,22 +46,19 @@
#' # tokenize corpus
#' toks <- tokens(cr_sample_corpus)
#'
#' # build a tokenized corpus of contexts surrounding a target term
#' immig_toks <- tokens_context(x = toks, pattern = "immigr*", window = 6L)
#'
#' ## given the target word "immigration"
#' set.seed(2021L)
#' model1 <- conText(formula = immigration ~ party + gender,
#' data = toks,
#' pre_trained = cr_glove_subset,
#' transform = TRUE, transform_matrix = cr_transform,
#' bootstrap = TRUE, num_bootstraps = 10,
#' stratify = TRUE,
#' stratify = FALSE,
#' permute = TRUE, num_permutations = 100,
#' window = 6, case_insensitive = TRUE,
#' verbose = FALSE)
#'
#' # notice, non-binary covariates are automatically "dummified"
#' # notice, character/factor covariates are automatically "dummified"
#' rownames(model1)
#'
#' # the beta coefficient 'partyR' in this case corresponds to the alc embedding
@@ -68,52 +67,51 @@
#' # (normed) coefficient table
#' model1@normed_cofficients
#'
conText <- function(formula, data, pre_trained, transform = TRUE, transform_matrix, bootstrap = TRUE, num_bootstraps = 20, stratify = TRUE, permute = TRUE, num_permutations = 100, window = 6L, valuetype = c("glob", "regex", "fixed"), case_insensitive = TRUE, hard_cut = FALSE, verbose = TRUE){
conText <- function(formula, data, pre_trained, transform = TRUE, transform_matrix, bootstrap = TRUE, num_bootstraps = 100, stratify = FALSE, permute = TRUE, num_permutations = 100, window = 6L, valuetype = c("glob", "regex", "fixed"), case_insensitive = TRUE, hard_cut = FALSE, verbose = TRUE){

# initial checks
if(class(data)[1] != "tokens") stop("data must be of class tokens")
if(!transform & !is.null(transform_matrix)) warning("Warning: transform = FALSE means transform_matrix argument was ignored. If that was not your intention, use transform = TRUE.")
if(class(data)[1] != "tokens") stop("data must be of class tokens", call. = FALSE)
if(!transform && !is.null(transform_matrix)) warning('Warning: transform = FALSE means transform_matrix argument was ignored. If that was not your intention, use transform = TRUE.', call. = FALSE)
if(any(grepl("factor\\(|\\)", formula))) stop('It seems you are using factor() in "formula" to create a factor variable. \n Please create it directly in "data" and re-run conText.', call. = FALSE) # pre-empt users using lm-type notation

# extract dependent variable
target <- as.character(formula[[2]])
if(length(target) > 1) target <- target[2:length(target)]

# mirror lm convention: if DV is "." then the full text is embedded, otherwise find and embed the context around the DV
if(target != "."){

# create a corpus of contexts
toks <- tokens_context(x = data, pattern = target, window = window, valuetype = valuetype, case_insensitive = case_insensitive, hard_cut = hard_cut, verbose = verbose)
docvars <- quanteda::docvars(toks) %>% dplyr::select(-pattern)

}else{
toks <- data
docvars <- quanteda::docvars(toks)
}

#----------------------
# COVARIATES
#----------------------

# extract covariate names
docvars <- quanteda::docvars(toks)
if(formula[[3]] == "."){covariates <- names(docvars)}else{ # follows lm convention, if DV = ., regress on all variables in data
covariates <- setdiff(stringr::str_squish(unlist(strsplit(as.character(formula[[3]]), '+', fixed = TRUE))), '') # to allow for phrase DVs
if(any(!(covariates %in% names(docvars))))stop("one or more of the covariates could not be found in the data.")
covs_not_in_data <- covariates[!(covariates %in% names(docvars))]
if(length(covs_not_in_data) > 0) stop("the following covariates could not be found in the data: ", paste0(covs_not_in_data, collapse = ", "))
}

# select covariates
cov_vars <- docvars %>% dplyr::select(dplyr::all_of(covariates))

# check covariates are binary dummy variables
indicator_check <- apply(cov_vars, 2, function(i) all((class(i) %in% c('integer', 'numeric')) & (length(setdiff(i, c(0,1))) == 0)))

# if there are non-indicator variables
if(!all(indicator_check)){
non_indicator_vars <- names(which(indicator_check == FALSE))

# check whether they can be "dummified" (i.e. must be character or factor variables)
class_check <- sapply(dplyr::tibble(cov_vars[,non_indicator_vars]), function(i) is.character(i) | is.factor(i))
# check which covariates are binary dummy variables
numeric_vars <- c(names(which(sapply(cov_vars, is.numeric))), names(which(sapply(cov_vars, is.integer))))
non_numeric_vars <- setdiff(covariates, numeric_vars)

# if they can be "dummified", do so (see: https://cran.r-project.org/web/packages/fastDummies/fastDummies.pdf)
if(!all(class_check))stop("covariates must be either a binary indicator variable (0/1s), a character variable or a factor variable")
cov_vars <- fastDummies::dummy_cols(cov_vars, select_columns = non_indicator_vars, remove_first_dummy = TRUE, remove_selected_columns = TRUE, ignore_na = TRUE)
# dummify non-numeric/integer variables
# see: https://cran.r-project.org/web/packages/fastDummies/fastDummies.pdf
if(length(non_numeric_vars)>0){
cov_vars <- fastDummies::dummy_cols(cov_vars, select_columns = non_numeric_vars, remove_first_dummy = TRUE, remove_selected_columns = TRUE, ignore_na = TRUE)
}

# add intercept
@@ -256,10 +254,26 @@ bootstrap_ols <- function(Y = NULL, X = NULL, stratify = NULL){
X_bs <- cbind(obs = 1:nrow(X), X)

# sample observations with replacement
if(stratify) X_bs <- X_bs %>% dplyr::group_by_at(setdiff(names(X), "(Intercept)")) %>% dplyr::sample_n(size = dplyr::n(), replace = TRUE) %>% dplyr::ungroup() else{
if (stratify) {

# identify discrete covariates to stratify over
discrete_vars <- setdiff(colnames(X_bs)[sapply(X_bs, is.numeric)], c("obs", "(Intercept)"))

# sample with stratification if there are discrete variables to stratify over
if(length(discrete_vars) > 0) X_bs <- X_bs %>% dplyr::group_by_at(discrete_vars) %>% dplyr::sample_n(size = dplyr::n(), replace = TRUE) %>% dplyr::ungroup()
else{
warning('no discrete covariate to stratify over. Will proceed without stratifying.', call. = FALSE)
X_bs <- dplyr::sample_n(X_bs, size = nrow(X_bs), replace = TRUE)
}

} else{

X_bs <- dplyr::sample_n(X_bs, size = nrow(X_bs), replace = TRUE)

}

#X_bs <- dplyr::sample_n(X_bs, size = nrow(X_bs), replace = TRUE)

# subset Y to sampled observations
Y_bs <- Y[X_bs$obs,]

@@ -299,4 +313,3 @@ run_ols <- function(Y = NULL, X = NULL){
return(list('betas' = betas, 'normed_betas' = normed_betas))

}
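
The stratified branch of bootstrap_ols() above resamples rows within each discrete covariate group, keeping group sizes fixed. A standalone toy illustration of that resampling step (not package code):

library(dplyr)

# toy design matrix: an observation id and one discrete (dummy) covariate
X_bs <- data.frame(obs = 1:6, partyR = c(0, 0, 0, 1, 1, 1))

# resample within each value of partyR, preserving group sizes
set.seed(42L)
X_strat <- X_bs %>%
  dplyr::group_by(partyR) %>%
  dplyr::sample_n(size = dplyr::n(), replace = TRUE) %>%
  dplyr::ungroup()

table(X_strat$partyR)  # same 3/3 split as the original
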

40 changes: 33 additions & 7 deletions R/cos_sim.R
@@ -5,14 +5,19 @@
#' F = number of features and D = embedding dimensions.
#' rownames(pre_trained) = set of features for which there is a pre-trained embedding.
#' @param features (character) features of interest.
#' @param stem (logical) - If TRUE, both `features` and `rownames(pre_trained)`
#' are stemmed and average cosine similarities are reported.
#' We recommend you remove misspelled words from `pre_trained` as these can
#' significantly influence the average.
#' @param as_list (logical) if FALSE all results are combined into a single data.frame
#' If TRUE, a list of data.frames is returned with one data.frame per feature.
#'
#' @return a `data.frame` or list of data.frames (one for each target)
#' with the following columns:
#' \describe{
#' \item{`target`}{ (character) rownames of `x`,
#' the labels of the ALC embeddings.}
#' the labels of the ALC embeddings.
#' NA if is.null(rownames(x)).}
#' \item{`feature`}{(character) feature terms defined in
#' the `features` argument.}
#' \item{`value`}{(numeric) cosine similarity between `x`
@@ -43,19 +48,40 @@
#' immig_wv_party <- dem_group(immig_dem, groups = immig_dem@docvars$party)
#'
#' # compute the cosine similarity between each party's embedding and a specific set of features
#' cos_sim(immig_wv_party, pre_trained = cr_glove_subset,
#' cos_sim(x = immig_wv_party, pre_trained = cr_glove_subset,
#' features = c('reform', 'enforcement'), as_list = FALSE)
cos_sim <- function(x, pre_trained, features = NULL, as_list = TRUE){
cos_sim <- function(x, pre_trained, features = NULL, stem = FALSE, as_list = TRUE){

# for single numeric vectors
if(is.null(dim(x)) && length(x) == dim(pre_trained)[2]) x <- matrix(x, nrow = 1)

# check features are in pre-trained embeddings
feature_check <- features %in% rownames(pre_trained)
if(!all(feature_check)) stop('the following features do not appear to have an embedding in the set of pre-trained embeddings provided: ', paste(features[which(!feature_check)], collapse = ', '))
if(stem){
if (requireNamespace("SnowballC", quietly = TRUE)) {
features <- SnowballC::wordStem(features)
pre_trained_feats <- SnowballC::wordStem(rownames(pre_trained))
} else {
stem <- FALSE
pre_trained_feats <- rownames(pre_trained)
warning('"SnowballC (>= 0.7.0)" package must be installed to use stemmming option. Will proceed without stemming.', call. = FALSE)
}
} else pre_trained_feats <- rownames(pre_trained)
feature_check <- features %in% pre_trained_feats

# check if any of the features are present, if none, stop
if(!any(feature_check)) stop('none of the features appear to have an embedding in the set of pre-trained embeddings provided, please select other features.', call. = FALSE)
if(!all(feature_check)) warning('the following features do not appear to have an embedding in the set of pre-trained embeddings provided: ', paste(features[which(!feature_check)], collapse = ', '))

# compute cosine similarity
cos_sim <- text2vec::sim2(x, matrix(pre_trained[features,], ncol = ncol(pre_trained), dimnames = list(features)), method = 'cosine', norm = 'l2')
cos_sim <- text2vec::sim2(x, pre_trained, method = 'cosine', norm = 'l2')

# convert to dataframe
result <- reshape2::melt(as.matrix(cos_sim)) %>% setNames(c('target', 'feature', 'value'))
cos_sim <- reshape2::melt(as.matrix(cos_sim)) %>% setNames(c('target', 'feature', 'value'))
if(is.null(rownames(x))) cos_sim$target <- NA

# stemming
if(stem) result <- cos_sim %>% dplyr::mutate(feature = SnowballC::wordStem(feature)) %>% dplyr::group_by(target, feature) %>% dplyr::summarise(dplyr::across(where(is.numeric), mean), .groups = "drop") %>% dplyr::ungroup() %>% dplyr::filter(feature %in% features)
else result <- cos_sim %>% dplyr::filter(feature %in% features)

# if !as_list return a list object with an item for each feature data.frame
if(as_list) result <- lapply(unique(result$feature), function(i) result[result$feature == i,] %>% dplyr::mutate(feature = as.character(feature))) %>% setNames(unique(result$feature))
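
Building on the documented example above, the new stem argument averages similarities within each stem. A sketch assuming the immig_wv_party object from the package examples and an installed SnowballC:

# cosine similarity with stemming: features and pre-trained rownames are
# stemmed, and similarities are averaged within each stem
cos_sim(x = immig_wv_party, pre_trained = cr_glove_subset,
        features = c("reform", "enforcement"), stem = TRUE, as_list = FALSE)
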
3 changes: 2 additions & 1 deletion R/data.R
@@ -31,12 +31,13 @@
#' containing the regular expression "immig*". Then 100 docs from each party-gender pair
#' are randomly sampled. For full data and pre-processing file, see:
#' https://www.dropbox.com/sh/jsyrag7opfo7l7i/AAB1z7tumLuKihGu2-FDmhmKa?dl=0
#' For NOMINATE scores, see: https://voteview.com/data
#'
#' @format A quanteda corpus with 200 documents and 4 docvars:
#' \describe{
#' \item{party}{party of speaker, (D)emocrat or (R)epublican}
#' \item{gender}{gender of speaker, (F)emale or (M)ale}
#' \item{session_id}{id of Congress session in which speech was given}
#' \item{nominate_dim1}{dimension 1 of the nominate score}
#' ...
#' }
#' @source \url{https://data.stanford.edu/congress_text}
2 changes: 1 addition & 1 deletion R/dem.R
@@ -84,7 +84,7 @@ dem <- function(x, pre_trained, transform = TRUE, transform_matrix, verbose = TR
docvars = quanteda::docvars(x)[included,,drop=FALSE],
features = overlapping_features,
Dimnames = list(
docs = rownames(x)[included],
docs = as.character(quanteda::docid(x)[included]),
columns = NULL))


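
With the change above, dem rownames track quanteda document ids rather than positional rownames, so they survive subsetting of the underlying tokens. A sketch assuming immig_toks built with tokens_context() as in the earlier examples:

# build a document-embedding matrix; rownames now come from quanteda docids
immig_dfm <- quanteda::dfm(immig_toks)
immig_dem <- dem(x = immig_dfm, pre_trained = cr_glove_subset,
                 transform = TRUE, transform_matrix = cr_transform,
                 verbose = FALSE)
head(rownames(immig_dem))
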
20 changes: 14 additions & 6 deletions R/find_nns.R
@@ -11,6 +11,10 @@
#' \item{`"l2"`}{cosine similarity}
#' \item{`"none"`}{inner product}
#' }
#' @param stem (logical) - whether to stem candidates when evaluating nns. Default is FALSE.
#' If TRUE, candidate stems are ranked by their average cosine similarity to the target.
#' We recommend you remove misspelled words from the candidate set `candidates` as these can
#' significantly influence the average.
#'
#' @return (character) vector of nearest neighbors to target
#' @export
@@ -20,10 +24,14 @@
# find nearest neighbors
#' find_nns(target_embedding = cr_glove_subset['immigration',],
#' pre_trained = cr_glove_subset, N = 5,
#' candidates = NULL, norm = "l2")
find_nns <- function(target_embedding, pre_trained, N = 5, candidates = NULL, norm = "l2"){
if(is.null(candidates)) cos_sim <- text2vec::sim2(x = pre_trained, y = matrix(target_embedding, nrow = 1), method = "cosine", norm = norm)
if(!is.null(candidates)) cos_sim <- text2vec::sim2(x = pre_trained[candidates,], y = matrix(target_embedding, nrow = 1), method = "cosine", norm = norm)
nn <- cos_sim[order(-cos_sim),]
return(names(nn)[1:N])
#' candidates = NULL, norm = "l2", stem = FALSE)
find_nns <- function(target_embedding, pre_trained, N = 5, candidates = NULL, norm = "l2", stem = FALSE){
if(is.null(candidates)) cos_sim <- text2vec::sim2(x = pre_trained, y = matrix(target_embedding, nrow = 1), method = "cosine", norm = norm)[,1]
if(!is.null(candidates)) cos_sim <- text2vec::sim2(x = pre_trained[candidates,], y = matrix(target_embedding, nrow = 1), method = "cosine", norm = norm)[,1]
nn_df <- data.frame(token = names(cos_sim), value = unname(cos_sim)) %>% dplyr::arrange(-value)
if(stem){
if (requireNamespace("SnowballC", quietly = TRUE)) nn_df <- nn_df %>% dplyr::mutate(token = SnowballC::wordStem(token)) %>% dplyr::group_by(token) %>% dplyr::summarize(value = mean(value)) %>% dplyr::arrange(-value) %>% dplyr::ungroup()
else warning('"SnowballC (>= 0.7.0)" package must be installed to use the stemming option. Will proceed without stemming.', call. = FALSE)
}
return(nn_df$token[1:N])
}
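
The same documented example with stemming enabled; nearest neighbors are then ranked by each stem's average cosine similarity to the target (falls back to unstemmed ranking with a warning if SnowballC is missing):

find_nns(target_embedding = cr_glove_subset["immigration", ],
         pre_trained = cr_glove_subset, N = 5,
         candidates = NULL, norm = "l2", stem = TRUE)
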
13 changes: 8 additions & 5 deletions R/get_cos_sim.R
@@ -54,6 +54,7 @@
#' transform_matrix = cr_transform,
#' bootstrap = TRUE,
#' num_bootstraps = 10,
#' stem = TRUE,
#' as_list = FALSE)
get_cos_sim <- function(x,
groups = NULL,
@@ -63,11 +64,11 @@ get_cos_sim <- function(x,
transform_matrix,
bootstrap = TRUE,
num_bootstraps = 10,
as_list = TRUE,
verbose = TRUE) {
stem = FALSE,
as_list = TRUE) {

# initial checks
if(class(x)[1] != "tokens") stop("data must be of class tokens")
if(class(x)[1] != "tokens") stop("data must be of class tokens", call. = FALSE)

# add grouping variable to docvars
if(!is.null(groups)) quanteda::docvars(x) <- NULL; quanteda::docvars(x, "group") <- groups
@@ -80,6 +81,7 @@
pre_trained = pre_trained,
transform = transform,
transform_matrix = transform_matrix,
stem = stem,
as_list = FALSE),
simplify = FALSE)
result <- do.call(rbind, cossimdf_bs) %>%
@@ -105,7 +107,7 @@
}

# compute cosine similarity
result <- cos_sim(x = wvs, pre_trained = pre_trained, features = features)
result <- cos_sim(x = wvs, pre_trained = pre_trained, features = features, stem = stem, as_list = FALSE)
}

# if !as_list return a list object with an item for each feature data.frame
@@ -121,6 +123,7 @@ cos_sim_boostrap <- function(x,
pre_trained,
transform = TRUE,
transform_matrix,
stem = stem,
as_list = FALSE){

# sample tokens with replacement
@@ -144,7 +147,7 @@
}

# compute cosine similarity
result <- cos_sim(x = wvs, pre_trained = pre_trained, features = features, as_list = FALSE)
result <- cos_sim(x = wvs, pre_trained = pre_trained, features = features, stem = stem, as_list = FALSE)

return(result)

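
A fuller sketch of the updated get_cos_sim() call, combining the partially shown example above with the new stem argument. It assumes immig_toks built with tokens_context() as in the other examples, plus arguments (features, pre_trained) hidden in the collapsed hunks; note the updated signature drops verbose:

set.seed(2021L)
get_cos_sim(x = immig_toks,
            groups = quanteda::docvars(immig_toks, "party"),
            features = c("reform", "enforcement"),
            pre_trained = cr_glove_subset,
            transform = TRUE, transform_matrix = cr_transform,
            bootstrap = TRUE, num_bootstraps = 10,
            stem = TRUE, as_list = FALSE)
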
