-
Notifications
You must be signed in to change notification settings - Fork 19
/
get_local_vocab.R
29 lines (25 loc) · 1.13 KB
/
get_local_vocab.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#' Find the vocabulary shared by a set of texts and pretrained embeddings.
#'
#' The contexts are tokenized with `quanteda::tokens()` and tabulated with
#' `quanteda::dfm()`; the feature names of the resulting matrix are then
#' intersected with the row names of `pre_trained`. Both quanteda functions are
#' called with their default settings, so whatever feature normalization those
#' defaults apply also determines which features can match.
#'
#' @param context (character) vector of contexts (usually `context` in
#' `get_context()` output)
#' @param pre_trained (numeric) a F x D matrix of pretrained embeddings, where
#' F = number of features and D = embedding dimensions. The features covered
#' by the embeddings are read from `rownames(pre_trained)`.
#'
#' @return (character) vector of the features that appear both in the texts
#' and in the row names of `pre_trained`.
#'
#' @export
#' @rdname get_local_vocab
#' @keywords get_local_vocab
#' @examples
#' # find local vocab (use it to define the candidate of nearest neighbors)
#' local_vocab <- get_local_vocab(cr_sample_corpus, pre_trained = cr_glove_subset)
get_local_vocab <- function(context, pre_trained) {
  # tokenize the contexts, then tabulate the tokens into a term-feature matrix
  context_tokens <- quanteda::tokens(context)
  context_dfm <- quanteda::dfm(context_tokens)

  # feature sets on each side of the comparison
  text_features <- colnames(context_dfm)
  embedding_features <- rownames(pre_trained)

  # features present in both the texts and the pretrained embeddings
  intersect(text_features, embedding_features)
}