diff --git a/.DS_Store b/.DS_Store
index ebc5503..0632c31 100644
Binary files a/.DS_Store and b/.DS_Store differ
diff --git a/R/classes.R b/R/classes.R
index a9520d2..bdb4bdf 100644
--- a/R/classes.R
+++ b/R/classes.R
@@ -13,7 +13,7 @@ setClass("dem",
            docvars = "data.frame",
            features = "character"),
          prototype = list(Dim = integer(2),
-                          Dimnames = list(rows = character(), columns = NULL),
+                          Dimnames = list(docs = character(), columns = NULL),
                           docvars = data.frame(row.names = integer()),
                           features = character()),
          contains = "dgCMatrix")
diff --git a/R/get_cos_sim.R b/R/get_cos_sim.R
index 5dfef73..8a1d75e 100644
--- a/R/get_cos_sim.R
+++ b/R/get_cos_sim.R
@@ -1,7 +1,7 @@
 #' Given a corpus and a set of features, calculate cosine similarities over
 #' a grouping variable.
 #'
-#' @param x a (quanteda) corpus or character vector
+#' @param x a (quanteda) tokens object
 #' @param groups a grouping variable
 #' @param features (character) features of interest
 #' @inheritParams dem
@@ -11,8 +11,6 @@
 #' if groups defined, sampling is automatically stratified; top nns are those with
 #' the highest average over all bootstrap samples.
 #' @param num_bootstraps (integer) number of bootstraps to use
-#' @param what character; which quanteda tokenizer to use. You will rarely want to change this.
-#' For Chinese texts you may want to set what = 'fastestword'.
 #'
 #' @return a `data.frame` or list of data.frames (one for each target)
 #' with the following columns:
@@ -36,8 +34,10 @@
 #' immig_corpus <- corpus_context(x = cr_sample_corpus,
 #' pattern = "immigration", window = 6L, verbose = TRUE)
 #'
-#' get_cos_sim(x = immig_corpus,
-#' groups = docvars(immig_corpus, 'party'),
+#' immig_toks <- tokens(immig_corpus)
+#'
+#' get_cos_sim(x = immig_toks,
+#' groups = docvars(immig_toks, "party"),
 #' features = c("reform", "enforce"),
 #' pre_trained = glove_subset,
 #' transform = TRUE,
@@ -54,22 +54,20 @@ get_cos_sim <- function(x,
                         transform_matrix,
                         bootstrap = TRUE,
                         num_bootstraps = 10,
-                        what = 'word',
                         as_list = TRUE,
                         verbose = TRUE) {
 
-  # create a new corpus
-  x <- quanteda::corpus(as.character(x), docvars = data.frame('group' = groups))
+  # add grouping variable to docvars
+  if(!is.null(groups)) docvars(x) <- NULL; docvars(x, "group") <- groups
 
   if(bootstrap){
     cossimdf_bs <- replicate(num_bootstraps,
                           cos_sim_boostrap(x = x,
-                                           groups = quanteda::docvars(x, 'group'),
+                                           groups = groups,
                                            features = features,
                                            pre_trained = pre_trained,
                                            transform = transform,
                                            transform_matrix = transform_matrix,
-                                           what = what,
                                            as_list = FALSE),
                           simplify = FALSE)
     result <- do.call(rbind, cossimdf_bs) %>%
@@ -81,23 +79,21 @@ get_cos_sim <- function(x,
       dplyr::select('target', 'feature', 'value', 'std.error')
   }else{
 
-    # create a new corpus
-    x <- quanteda::corpus(as.character(x), docvars = data.frame('group' = groups))
-
-    # tokenize texts
-    corpus_toks <- quanteda::tokens(x, what = what)
-
     # create document-feature matrix
-    corpus_dfm <- quanteda::dfm(corpus_toks, tolower = FALSE)
+    x_dfm <- quanteda::dfm(x, tolower = FALSE)
 
     # compute document-embedding matrix
-    corpus_dem <- dem(x = corpus_dfm, pre_trained = pre_trained, transform_matrix = transform_matrix, transform = transform, verbose = verbose)
+    x_dem <- dem(x = x_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE)
 
     # aggregate dems by group var
-    if(!is.null(groups)) corpus_dem <- dem_group(x = corpus_dem, groups = corpus_dem@docvars$group)
+    if(!is.null(groups)){
+      wvs <- dem_group(x = x_dem, groups = x_dem@docvars$group)
+    } else {
+      wvs <- matrix(colMeans(x_dem), ncol = ncol(x_dem))
+    }
 
     # compute cosine similarity
-    result <- cos_sim(x = corpus_dem, pre_trained = pre_trained, features = features)
+    result <- cos_sim(x = wvs, pre_trained = pre_trained, features = features)
   }
 
   # if !as_list return a list object with an item for each feature data.frame
@@ -113,25 +109,30 @@ cos_sim_boostrap <- function(x,
                              pre_trained,
                              transform = TRUE,
                              transform_matrix,
-                             what = what,
                              as_list = FALSE){
-  # create a new corpus
-  x <- quanteda::corpus_sample(x, size = quanteda::ndoc(x), replace = TRUE, by = groups)
 
-  # tokenize texts
-  corpus_toks <- quanteda::tokens(x, what = what)
+  # sample tokens with replacement
+  if(!is.null(groups)) {
+    x <- quanteda::tokens_sample(x = x, size = table(groups), replace = TRUE, by = groups)
+  } else {
+    x <- quanteda::tokens_sample(x = x, size = quanteda::ndoc(x), replace = TRUE)
+  }
 
   # create document-feature matrix
-  corpus_dfm <- quanteda::dfm(corpus_toks, tolower = FALSE)
+  x_dfm <- quanteda::dfm(x, tolower = FALSE)
 
   # compute document-embedding matrix
-  corpus_dem <- dem(x = corpus_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE)
+  x_dem <- dem(x = x_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE)
 
-  # aggregate dems by group var
-  if(!is.null(groups)) corpus_dem <- dem_group(x = corpus_dem, groups = corpus_dem@docvars$group)
+  # aggregate dems by group var if defined
+  if(!is.null(groups)){
+    wvs <- dem_group(x = x_dem, groups = x_dem@docvars$group)
+  } else {
+    wvs <- matrix(colMeans(x_dem), ncol = ncol(x_dem))
+  }
 
   # compute cosine similarity
-  result <- cos_sim(x = corpus_dem, pre_trained = pre_trained, features = features, as_list = FALSE)
+  result <- cos_sim(x = wvs, pre_trained = pre_trained, features = features, as_list = FALSE)
 
   return(result)
 
diff --git a/R/get_nns.R b/R/get_nns.R
index 71b68f6..65d2a43 100644
--- a/R/get_nns.R
+++ b/R/get_nns.R
@@ -1,7 +1,7 @@
 #' Given a corpus and a set of candidate neighbors, find the top N nearest
 #' neighbors.
 #'
-#' @param x a (quanteda) corpus or character vector
+#' @param x a (quanteda) tokens object
 #' @inheritParams nns
 #' @param groups grouping variable equal in length to the number of documents
 #' @inheritParams dem
@@ -10,8 +10,6 @@
 #' if groups defined, sampling is automatically stratified; top nns are those with
 #' the highest average over all bootstrap samples.
 #' @param num_bootstraps (integer) number of bootstraps to use
-#' @param what character; which quanteda tokenizer to use. You will rarely want to change this.
-#' For Chinese texts you may want to set what = 'fastestword'.
 #'
 #' @return a `data.frame` or list of data.frames (one for each target)
 #' with the following columns:
@@ -34,8 +32,10 @@
 #' immig_corpus <- corpus_context(x = cr_sample_corpus,
 #' pattern = "immigration", window = 6L, verbose = TRUE)
 #'
-#' get_nns(x = immig_corpus, N = 10,
-#' groups = docvars(immig_corpus, 'party'),
+#' immig_toks <- tokens(immig_corpus)
+#'
+#' get_nns(x = immig_toks, N = 10,
+#' groups = docvars(immig_toks, "party"),
 #' candidates = character(0),
 #' pre_trained = glove_subset,
 #' transform = TRUE,
@@ -52,21 +52,20 @@ get_nns <- function(x,
                     transform_matrix,
                     bootstrap = TRUE,
                     num_bootstraps = 10,
-                    what = 'word',
                     as_list = TRUE) {
 
-  # create a new corpus
-  x <- quanteda::corpus(as.character(x), docvars = data.frame('group' = groups))
+  # add grouping variable to docvars
+  if(!is.null(groups)) docvars(x) <- NULL; docvars(x, "group") <- groups
 
+  # if bootstrap
   if(bootstrap){
     nnsdf_bs <- replicate(num_bootstraps,
               nns_boostrap(x = x,
-                           groups = quanteda::docvars(x, 'group'),
+                           groups = groups,
                            candidates = candidates,
                            pre_trained = pre_trained,
                            transform = transform,
                            transform_matrix = transform_matrix,
-                           what = what,
                            as_list = FALSE),
               simplify = FALSE)
     result <- do.call(rbind, nnsdf_bs) %>%
@@ -81,20 +80,21 @@ get_nns <- function(x,
       dplyr::select('target', 'feature', 'rank', 'value', 'std.error')
   }else{
 
-  # tokenize texts
-  corpus_toks <- quanteda::tokens(x, what = what)
-
   # create document-feature matrix
-  corpus_dfm <- quanteda::dfm(corpus_toks, tolower = FALSE)
+  x_dfm <- quanteda::dfm(x, tolower = FALSE)
 
   # compute document-embedding matrix
-  corpus_dem <- dem(x = corpus_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE)
+  x_dem <- dem(x = x_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE)
 
   # aggregate dems by group var
-  if(!is.null(groups)) corpus_dem <- dem_group(x = corpus_dem, groups = corpus_dem@docvars$group)
+  if(!is.null(groups)){
+    wvs <- dem_group(x = x_dem, groups = x_dem@docvars$group)
+  } else {
+    wvs <- matrix(colMeans(x_dem), ncol = ncol(x_dem))
+  }
 
   # find nearest neighbors
-  result <- nns(x = corpus_dem, N = N, candidates = candidates, pre_trained = pre_trained, as_list = FALSE)
+  result <- nns(x = wvs, N = N, candidates = candidates, pre_trained = pre_trained, as_list = FALSE)
   }
 
   # if !as_list return a list object with an item for each target data.frame
@@ -111,26 +111,30 @@ nns_boostrap <- function(x,
                          pre_trained,
                          transform = TRUE,
                          transform_matrix,
-                         what = what,
                          as_list = FALSE){
 
-  # create a new corpus
-  x <- quanteda::corpus_sample(x, size = quanteda::ndoc(x), replace = TRUE, by = groups)
-
-  # tokenize texts
-  corpus_toks <- quanteda::tokens(x, what = what)
+  # sample tokens with replacement
+  if(!is.null(groups)) {
+    x <- quanteda::tokens_sample(x = x, size = table(groups), replace = TRUE, by = groups)
+    } else {
+      x <- quanteda::tokens_sample(x = x, size = quanteda::ndoc(x), replace = TRUE)
+  }
 
   # create document-feature matrix
-  corpus_dfm <- quanteda::dfm(corpus_toks, tolower = FALSE)
+  x_dfm <- quanteda::dfm(x, tolower = FALSE)
 
   # compute document-embedding matrix
-  corpus_dem <- dem(x = corpus_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE)
+  x_dem <- dem(x = x_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE)
 
-  # aggregate dems by group var
-  if(!is.null(groups)) corpus_dem <- dem_group(x = corpus_dem, groups = corpus_dem@docvars$group)
+  # aggregate dems by group var if defined
+  if(!is.null(groups)){
+    wvs <- dem_group(x = x_dem, groups = x_dem@docvars$group)
+  } else {
+    wvs <- matrix(colMeans(x_dem), ncol = ncol(x_dem))
+    }
 
   # find nearest neighbors
-  result <- nns(x = corpus_dem, N = Inf, candidates = candidates, pre_trained = pre_trained, as_list = FALSE)
+  result <- nns(x = wvs, N = Inf, candidates = candidates, pre_trained = pre_trained, as_list = FALSE)
 
   return(result)
 
diff --git a/R/get_nns_ratio.R b/R/get_nns_ratio.R
index 864ef8f..be27b51 100644
--- a/R/get_nns_ratio.R
+++ b/R/get_nns_ratio.R
@@ -1,20 +1,19 @@
 #' Given a corpus and a binary grouping variable, computes the ratio of cosine similarities
 #' over the union of their respective N nearest neighbors.
 #'
-#' @param x a (quanteda) corpus or character vector
+#' @param x a (quanteda) tokens object
 #' @inheritParams nns_ratio
 #' @inheritParams dem
 #' @inheritParams dem
 #' @inheritParams dem_group
-#' @param numerator (character)
+#' @param numerator (character) defines which group is the nuemerator in the ratio.
 #' @param bootstrap (logical) if TRUE, bootstrap nns - sample from corpus with replacement;
 #' if groups defined, sampling is automatically stratified; top nns are those with
 #' the highest average over all bootstrap samples.
 #' @param num_bootstraps (integer) number of bootstraps to use
 #' @param permute (logical) if TRUE, compute empirical p-values using permutation test
 #' @param num_permutations (numeric) number of permutations to use
-#' @param what character; which quanteda tokenizer to use. You will rarely want to change this.
-#' For Chinese texts you may want to set what = 'fastestword'.
+#' @param verbose provide information on which group is the numerator
 #'
 #' @return a `data.frame` with following columns:
 #'  \item{`feature`}{(character) vector of features from the candidate set,
@@ -39,12 +38,17 @@
 #' window = 6L,
 #' verbose = TRUE)
 #'
+#' # identify features in local vocab that overlap with pre-trained embeddings
+#' # these are used to define candidate nearest neighbors below (optional)
 #' local_vocab <- get_local_vocab(as.character(immig_corpus), pre_trained = glove_subset)
 #'
+#' # tokenize
+#' immig_toks <- tokens(immig_corpus)
+#'
 #' set.seed(42L)
-#' get_nns_ratio(x = immig_corpus,
+#' temp <- get_nns_ratio(x = immig_toks,
 #' N = 20,
-#' groups = docvars(immig_corpus, 'party'),
+#' groups = docvars(immig_toks, 'party'),
 #' numerator = "R",
 #' candidates = local_vocab,
 #' pre_trained = glove_subset,
@@ -53,11 +57,11 @@
 #' bootstrap = TRUE,
 #' num_bootstraps = 10,
 #' permute = TRUE,
-#' num_permutations = 10)
-#'
+#' num_permutations = 10,
+#' verbose = TRUE)
 get_nns_ratio <- function(x,
                           N = 10,
-                          groups = NULL,
+                          groups,
                           numerator = NULL,
                           candidates = character(0),
                           pre_trained,
@@ -67,38 +71,32 @@ get_nns_ratio <- function(x,
                           num_bootstraps = 10,
                           permute = TRUE,
                           num_permutations = 100,
-                          what = 'word'){
+                          verbose = TRUE){
 
   # checks
-  group_order <- unique(groups)
-  if(is.null(group_order)) stop("a binary grouping variable must be provided")
-  if(length(group_order)!=2) stop("groups must be binary")
+  group_vars <- unique(groups)
+  if(is.null(group_vars) | length(group_vars)!=2) stop("a binary grouping variable must be provided")
   if(!is.null(numerator)){
-    if(!(numerator %in% group_order)){
-    stop("numerator must refer to one of the two groups in the groups argument")}else{
-    group_order <- c(numerator, setdiff(group_order, numerator))}
+    if(!(numerator %in% group_vars)) stop("numerator must refer to one of the two groups in the groups argument")
   }
+  denominator <- setdiff(group_vars, numerator)
 
-  # create a new corpus
-  x <- quanteda::corpus(as.character(x), docvars = data.frame('group' = groups))
-
-  # tokenize texts
-  corpus_toks <- quanteda::tokens(x, what = what)
+  # add grouping variable to docvars
+  if(!is.null(groups)) docvars(x) <- NULL; docvars(x, "group") <- groups
 
   # create document-feature matrix
-  corpus_dfm <- quanteda::dfm(corpus_toks, tolower = FALSE)
+  x_dfm <- quanteda::dfm(x, tolower = FALSE)
 
   # compute document-embedding matrix
-  corpus_dem <- dem(x = corpus_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE)
+  x_dem <- dem(x = x_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE)
 
   # aggregate dems by group var
-  corpus_dem <- dem_group(x = corpus_dem, groups = corpus_dem@docvars$group)
-  corpus_dem <- corpus_dem[group_order,] # re-arrange groups
+  wvs <- dem_group(x = x_dem, groups = x_dem@docvars$group)
 
   # get top N nns (if N is Inf or NULL, use all features)
-  nnsdfs <- nns(x = corpus_dem, N = Inf, candidates = candidates, pre_trained = pre_trained, as_list = TRUE)
-  nnsdf1 <- if(is.null(N)) nnsdfs[[1]]$feature else nnsdfs[[1]]$feature[1:N]
-  nnsdf2 <- if(is.null(N)) nnsdfs[[2]]$feature else nnsdfs[[2]]$feature[1:N]
+  nnsdfs <- nns(x = wvs, N = Inf, candidates = candidates, pre_trained = pre_trained, as_list = TRUE)
+  nnsdf1 <- if(is.null(N)) nnsdfs[[numerator]]$feature else nnsdfs[[numerator]]$feature[1:N]
+  nnsdf2 <- if(is.null(N)) nnsdfs[[denominator]]$feature else nnsdfs[[denominator]]$feature[1:N]
 
   # get union of top N nns
   union_nns <- union(nnsdf1, nnsdf2)
@@ -106,7 +104,7 @@ get_nns_ratio <- function(x,
   if(!bootstrap){
 
     # find nearest neighbors ratio
-    result <- nns_ratio(x = corpus_dem, N = N, candidates = union_nns, pre_trained = pre_trained)
+    result <- nns_ratio(x = wvs, N = N, numerator = numerator, candidates = union_nns, pre_trained = pre_trained)
 
   }else{
 
@@ -114,13 +112,12 @@ get_nns_ratio <- function(x,
     # bootstrap ratio
     nnsratiodf_bs <- replicate(num_bootstraps,
                                nns_ratio_boostrap(x = x,
-                                       groups = quanteda::docvars(x, 'group'),
-                                       group_order = group_order,
+                                       groups = groups,
+                                       numerator = numerator,
                                        candidates = union_nns,
                                        pre_trained = pre_trained,
                                        transform = transform,
-                                       transform_matrix = transform_matrix,
-                                       what = what),
+                                       transform_matrix = transform_matrix),
                           simplify = FALSE)
     result <- do.call(rbind, nnsratiodf_bs) %>%
       dplyr::group_by(feature) %>%
@@ -141,13 +138,12 @@ get_nns_ratio <- function(x,
     # permute similarity
     cat('starting permutations \n')
     permute_out <- replicate(num_permutations, nns_ratio_permute(x,
-                                                                 groups = quanteda::docvars(x, 'group'),
-                                                                 group_order = group_order,
+                                                                 groups = groups,
+                                                                 numerator = numerator,
                                                                  candidates = union_nns,
                                                                  pre_trained = pre_trained,
                                                                  transform = transform,
-                                                                 transform_matrix = transform_matrix,
-                                                                 what = what),
+                                                                 transform_matrix = transform_matrix),
                              simplify = FALSE)
 
     # compute deviations of the observed ratios from 1
@@ -163,45 +159,40 @@ get_nns_ratio <- function(x,
 
   # add information on nns
   result <- result %>% dplyr::mutate(group = dplyr::case_when((feature %in% nnsdf1) & (feature %in% nnsdf2) ~ "shared",
-                                                              (feature %in% nnsdf1) & !(feature %in% nnsdf2) ~ group_order[1],
-                                                              !(feature %in% nnsdf1) & (feature %in% nnsdf2) ~ group_order[2]))
+                                                              (feature %in% nnsdf1) & !(feature %in% nnsdf2) ~ numerator,
+                                                              !(feature %in% nnsdf1) & (feature %in% nnsdf2) ~ denominator))
 
   # add an attribute specifying which group is the numerator and communicated this to user
-  attr(result, "numerator") <- group_order[1]
-  cat("NOTE: values refer to the ratio", paste0(group_order[1], "/", group_order[2], "."))
+  attr(result, "numerator") <- numerator
+  if(verbose) cat("NOTE: values refer to the ratio", paste0(numerator, "/", denominator, "."))
 
   return(result)
 }
 
-
 # sub-function
 nns_ratio_boostrap <- function(x,
                          groups,
-                         group_order,
+                         numerator = NULL,
                          candidates = character(0),
                          pre_trained = pre_trained,
                          transform = TRUE,
-                         transform_matrix = transform_matrix,
-                         what = what){
+                         transform_matrix = transform_matrix){
 
-  # create a new corpus
-  x <- quanteda::corpus_sample(x, size = quanteda::ndoc(x), replace = TRUE, by = groups)
 
-  # tokenize texts
-  corpus_toks <- quanteda::tokens(x, what = what)
+  # sample tokens with replacement
+  x <- quanteda::tokens_sample(x = x, size = table(groups), replace = TRUE, by = groups)
 
   # create document-feature matrix
-  corpus_dfm <- quanteda::dfm(corpus_toks, tolower = FALSE)
+  x_dfm <- quanteda::dfm(x, tolower = FALSE)
 
   # compute document-embedding matrix
-  corpus_dem <- dem(x = corpus_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE)
+  x_dem <- dem(x = x_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE)
 
-  # aggregate dems by group var
-  corpus_dem <- dem_group(x = corpus_dem, groups = corpus_dem@docvars$group)
-  corpus_dem <- corpus_dem[group_order,] # re-arrange groups
+  # aggregate dems by group
+  wvs <- dem_group(x = x_dem, groups = x_dem@docvars$group)
 
   # find nearest neighbors
-  result <- nns_ratio(x = corpus_dem, N = NULL, candidates = candidates, pre_trained = pre_trained)
+  result <- nns_ratio(x = wvs, N = NULL, numerator = numerator, candidates = candidates, pre_trained = pre_trained, verbose = FALSE)
 
   return(result)
 
@@ -210,31 +201,26 @@ nns_ratio_boostrap <- function(x,
 # runs permutations
 nns_ratio_permute <- function(x,
                               groups,
-                              group_order,
+                              numerator = NULL,
                               candidates = character(0),
                               pre_trained,
                               transform = TRUE,
-                              transform_matrix,
-                              what = 'word'){
+                              transform_matrix){
 
-  # shuffle the texts
-  x <- quanteda::corpus(sample(as.character(x)), docvars = data.frame('group' = groups))
-
-  # tokenize texts
-  corpus_toks <- quanteda::tokens(x, what = what)
+  # shuffle tokenized texts
+  docvars(x, 'group') <- sample(groups)
 
   # create document-feature matrix
-  corpus_dfm <- quanteda::dfm(corpus_toks, tolower = FALSE)
+  x_dfm <- quanteda::dfm(x, tolower = FALSE)
 
   # compute document-embedding matrix
-  corpus_dem <- dem(x = corpus_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE)
+  x_dem <- dem(x = x_dfm, pre_trained = pre_trained, transform = transform, transform_matrix = transform_matrix, verbose = FALSE)
 
-  # aggregate dems by group var
-  corpus_dem <- dem_group(x = corpus_dem, groups = corpus_dem@docvars$group)
-  corpus_dem <- corpus_dem[group_order,] # re-arrange groups
+  # aggregate dems by group
+  wvs <- dem_group(x = x_dem, groups = x_dem@docvars$group)
 
   # find nearest neighbors
-  result <- nns_ratio(x = corpus_dem, N = NULL, candidates = candidates, pre_trained = pre_trained)
+  result <- nns_ratio(x = wvs, N = NULL, numerator = numerator, candidates = candidates, pre_trained = pre_trained, verbose = FALSE)
 
   return(result)
 }
diff --git a/R/nns.R b/R/nns.R
index 260a529..7467551 100644
--- a/R/nns.R
+++ b/R/nns.R
@@ -83,7 +83,8 @@ nns <- function(x, N = 10, candidates = character(0), pre_trained, as_list = TRU
   if(is.null(rownames(x))) result$target <- NA
 
   # if !as_list return a list object with an item for each target data.frame
-  if(as_list) result <- lapply(unique(result$target), function(i) result[result$target == i,] %>% dplyr::mutate(target = as.character(target))) %>% setNames(unique(result$target))
+  if(as_list && !is.null(rownames(x))) result <- lapply(unique(result$target), function(i) result[result$target == i,] %>% dplyr::mutate(target = as.character(target))) %>% setNames(unique(result$target))
+  if(as_list && is.null(rownames(x))) message("although as_list = TRUE, will return a single tibble as there is only one word vector")
 
   return(result)
 }
diff --git a/R/nns_ratio.R b/R/nns_ratio.R
index e4da72a..ed78a14 100644
--- a/R/nns_ratio.R
+++ b/R/nns_ratio.R
@@ -3,12 +3,14 @@
 #'
 #' @param x a [dem-class] or [fem-class] object or in general a matrix of embeddings
 #' @param N number of nearest neighbors to consider
+#' @param numerator (character) defines which group is the nuemerator in the ratio
 #' Note: if nearest neighbors overlap, the resulting number of features
 #' will be fewer than 2*N.
 #' @param candidates character vector delimiting the features to consider as nearest neighbor candidates
 #' @param pre_trained a F x D matrix of numeric values corresponding to pretrained embeddings
 #' F = number of features and D = embedding dimensions.
 #' rownames(pre_trained) = set of features for which there is a pre-trained embedding
+#' @param verbose provide information on which group is the numerator
 #'
 #' @return a `data.frame` with following columns:
 #'  \item{`feature`}{(character) vector of features from the candidate set,
@@ -46,12 +48,15 @@
 #' immig_dem_party <- dem_group(immig_dem, groups = immig_dem@docvars$party)
 #'
 #' # find nearest neighbors
-#' nns_ratio(x = immig_dem_party, pre_trained = glove_subset, candidates = character(0), N = 10)
-nns_ratio <- function(x, N = 10, candidates = character(0), pre_trained){
+#' nns_ratio(x = immig_dem_party, N = 10, numerator = "R", candidates = character(0), pre_trained = glove_subset, verbose = TRUE)
+nns_ratio <- function(x, N = 10, numerator = NULL, candidates = character(0), pre_trained, verbose = TRUE){
 
   # check
   if(nrow(x)!=2) stop('nns_ratio can only be applied to a pair of embeddings i.e. nrow(x) must equal 2')
 
+  # re-arrange if numerator is defined
+  if(!is.null(numerator) && rownames(x)[1]!=numerator) x <- x[c(rownames(x)[2],rownames(x)[1]),]
+
   # get nns
   nnsdf1 <- nns(x = x[1,], N = Inf, candidates = candidates, pre_trained = pre_trained, as_list = FALSE)
   nnsdf2 <- nns(x = x[2,], N = Inf, candidates = candidates, pre_trained = pre_trained, as_list = FALSE)
@@ -64,5 +69,9 @@ nns_ratio <- function(x, N = 10, candidates = character(0), pre_trained){
   nnsdf2 <- nnsdf2 %>% dplyr::filter(feature %in% union_nns) %>% dplyr::arrange(feature)
   result <- data.frame(feature = nnsdf1$feature, value = nnsdf1$value/nnsdf2$value) %>% dplyr::arrange(-value)
 
+  # add an attribute specifying which group is the numerator and communicated this to user
+  attr(result, "numerator") <- rownames(x)[1]
+  if(verbose) cat("NOTE: values refer to the ratio", paste0(rownames(x)[1], "/", rownames(x)[2], "."))
+
   return(result)
 }
diff --git a/man/get_cos_sim.Rd b/man/get_cos_sim.Rd
index 225db9c..ba1e1b8 100644
--- a/man/get_cos_sim.Rd
+++ b/man/get_cos_sim.Rd
@@ -14,13 +14,12 @@ get_cos_sim(
   transform_matrix,
   bootstrap = TRUE,
   num_bootstraps = 10,
-  what = "word",
   as_list = TRUE,
   verbose = TRUE
 )
 }
 \arguments{
-\item{x}{a (quanteda) corpus or character vector}
+\item{x}{a (quanteda) tokens object}
 
 \item{groups}{a grouping variable}
 
@@ -42,9 +41,6 @@ the highest average over all bootstrap samples.}
 
 \item{num_bootstraps}{(integer) number of bootstraps to use}
 
-\item{what}{character; which quanteda tokenizer to use. You will rarely want to change this.
-For Chinese texts you may want to set what = 'fastestword'.}
-
 \item{as_list}{(logical) if FALSE all results are combined into a single data.frame
 If TRUE, a list of data.frames is returned with one data.frame per feature.}
 
@@ -74,8 +70,10 @@ library(quanteda)
 immig_corpus <- corpus_context(x = cr_sample_corpus,
 pattern = "immigration", window = 6L, verbose = TRUE)
 
-get_cos_sim(x = immig_corpus,
-groups = docvars(immig_corpus, 'party'),
+immig_toks <- tokens(immig_corpus)
+
+get_cos_sim(x = immig_toks,
+groups = docvars(immig_toks, "party"),
 features = c("reform", "enforce"),
 pre_trained = glove_subset,
 transform = TRUE,
diff --git a/man/get_nns.Rd b/man/get_nns.Rd
index e15bdff..a70ea7e 100644
--- a/man/get_nns.Rd
+++ b/man/get_nns.Rd
@@ -15,12 +15,11 @@ get_nns(
   transform_matrix,
   bootstrap = TRUE,
   num_bootstraps = 10,
-  what = "word",
   as_list = TRUE
 )
 }
 \arguments{
-\item{x}{a (quanteda) corpus or character vector}
+\item{x}{a (quanteda) tokens object}
 
 \item{N}{(numeric) number of nearest neighbors to return}
 
@@ -47,9 +46,6 @@ the highest average over all bootstrap samples.}
 
 \item{num_bootstraps}{(integer) number of bootstraps to use}
 
-\item{what}{character; which quanteda tokenizer to use. You will rarely want to change this.
-For Chinese texts you may want to set what = 'fastestword'.}
-
 \item{as_list}{(logical) if FALSE all results are combined into a single data.frame
 If TRUE, a list of data.frames is returned with one data.frame per target.}
 }
@@ -76,8 +72,10 @@ library(quanteda)
 immig_corpus <- corpus_context(x = cr_sample_corpus,
 pattern = "immigration", window = 6L, verbose = TRUE)
 
-get_nns(x = immig_corpus, N = 10,
-groups = docvars(immig_corpus, 'party'),
+immig_toks <- tokens(immig_corpus)
+
+get_nns(x = immig_toks, N = 10,
+groups = docvars(immig_toks, "party"),
 candidates = character(0),
 pre_trained = glove_subset,
 transform = TRUE,
diff --git a/man/get_nns_ratio.Rd b/man/get_nns_ratio.Rd
index a1ff55f..1234a81 100644
--- a/man/get_nns_ratio.Rd
+++ b/man/get_nns_ratio.Rd
@@ -8,7 +8,7 @@ over the union of their respective N nearest neighbors.}
 get_nns_ratio(
   x,
   N = 10,
-  groups = NULL,
+  groups,
   numerator = NULL,
   candidates = character(0),
   pre_trained,
@@ -18,19 +18,17 @@ get_nns_ratio(
   num_bootstraps = 10,
   permute = TRUE,
   num_permutations = 100,
-  what = "word"
+  verbose = TRUE
 )
 }
 \arguments{
-\item{x}{a (quanteda) corpus or character vector}
+\item{x}{a (quanteda) tokens object}
 
-\item{N}{number of nearest neighbors to consider
-Note: if nearest neighbors overlap, the resulting number of features
-will be fewer than 2*N.}
+\item{N}{number of nearest neighbors to consider}
 
 \item{groups}{a character or factor variable equal in length to the number of documents}
 
-\item{numerator}{(character)}
+\item{numerator}{(character) defines which group is the nuemerator in the ratio.}
 
 \item{candidates}{character vector delimiting the features to consider as nearest neighbor candidates}
 
@@ -54,8 +52,7 @@ the highest average over all bootstrap samples.}
 
 \item{num_permutations}{(numeric) number of permutations to use}
 
-\item{what}{character; which quanteda tokenizer to use. You will rarely want to change this.
-For Chinese texts you may want to set what = 'fastestword'.}
+\item{verbose}{provide information on which group is the numerator}
 }
 \value{
 a \code{data.frame} with following columns:
@@ -82,12 +79,17 @@ pattern = "immigration",
 window = 6L,
 verbose = TRUE)
 
+# identify features in local vocab that overlap with pre-trained embeddings
+# these are used to define candidate nearest neighbors below (optional)
 local_vocab <- get_local_vocab(as.character(immig_corpus), pre_trained = glove_subset)
 
+# tokenize
+immig_toks <- tokens(immig_corpus)
+
 set.seed(42L)
-get_nns_ratio(x = immig_corpus,
+temp <- get_nns_ratio(x = immig_toks,
 N = 20,
-groups = docvars(immig_corpus, 'party'),
+groups = docvars(immig_toks, 'party'),
 numerator = "R",
 candidates = local_vocab,
 pre_trained = glove_subset,
@@ -96,7 +98,7 @@ transform_matrix = khodakA,
 bootstrap = TRUE,
 num_bootstraps = 10,
 permute = TRUE,
-num_permutations = 10)
-
+num_permutations = 10,
+verbose = TRUE)
 }
 \keyword{get_nns_ratio}
diff --git a/man/nns_ratio.Rd b/man/nns_ratio.Rd
index 263cdee..a5f74df 100644
--- a/man/nns_ratio.Rd
+++ b/man/nns_ratio.Rd
@@ -5,12 +5,21 @@
 \title{Computes the ratio of cosine similarities for two embeddings over
 the union of their respective top N nearest neighbors.}
 \usage{
-nns_ratio(x, N = 10, candidates = character(0), pre_trained)
+nns_ratio(
+  x,
+  N = 10,
+  numerator = NULL,
+  candidates = character(0),
+  pre_trained,
+  verbose = TRUE
+)
 }
 \arguments{
 \item{x}{a \linkS4class{dem} or \linkS4class{fem} object or in general a matrix of embeddings}
 
-\item{N}{number of nearest neighbors to consider
+\item{N}{number of nearest neighbors to consider}
+
+\item{numerator}{(character) defines which group is the nuemerator in the ratio
 Note: if nearest neighbors overlap, the resulting number of features
 will be fewer than 2*N.}
 
@@ -19,6 +28,8 @@ will be fewer than 2*N.}
 \item{pre_trained}{a F x D matrix of numeric values corresponding to pretrained embeddings
 F = number of features and D = embedding dimensions.
 rownames(pre_trained) = set of features for which there is a pre-trained embedding}
+
+\item{verbose}{provide information on which group is the numerator}
 }
 \value{
 a \code{data.frame} with following columns:
@@ -58,6 +69,6 @@ verbose = FALSE)
 immig_dem_party <- dem_group(immig_dem, groups = immig_dem@docvars$party)
 
 # find nearest neighbors
-nns_ratio(x = immig_dem_party, pre_trained = glove_subset, candidates = character(0), N = 10)
+nns_ratio(x = immig_dem_party, N = 10, numerator = "R", candidates = character(0), pre_trained = glove_subset, verbose = TRUE)
 }
 \keyword{nns_ratio}