From 179387986649ad688fa3d60ad72f877a7f017136 Mon Sep 17 00:00:00 2001
From: Ian Johnson
Date: Fri, 12 Jul 2024 12:22:49 -0400
Subject: [PATCH] move to sentence_transformers for embedding script

---
 latentscope/models/providers/transformers.py | 65 +++++++++++---------
 requirements.txt                             |  3 +-
 2 files changed, 37 insertions(+), 31 deletions(-)

diff --git a/latentscope/models/providers/transformers.py b/latentscope/models/providers/transformers.py
index e075a95..7f73a1f 100644
--- a/latentscope/models/providers/transformers.py
+++ b/latentscope/models/providers/transformers.py
@@ -7,50 +7,55 @@ def __init__(self, name, params):
         self.torch = torch
         self.device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
 
-    def cls_pooling(self, model_output):
-        return model_output[0][:, 0]
+    # def cls_pooling(self, model_output):
+    #     return model_output[0][:, 0]
 
-    def average_pooling(self, model_output, attention_mask):
-        last_hidden = model_output.last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
-        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+    # def average_pooling(self, model_output, attention_mask):
+    #     last_hidden = model_output.last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
+    #     return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
 
-    def mean_pooling(self, model_output, attention_mask):
-        token_embeddings = model_output[0]
-        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-        return self.torch.sum(token_embeddings * input_mask_expanded, 1) / self.torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+    # def mean_pooling(self, model_output, attention_mask):
+    #     token_embeddings = model_output[0]
+    #     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    #     return self.torch.sum(token_embeddings * input_mask_expanded, 1) / self.torch.clamp(input_mask_expanded.sum(1), min=1e-9)
 
     def load_model(self):
-        from transformers import AutoTokenizer, AutoModel
+        # from transformers import AutoTokenizer, AutoModel
+        from sentence_transformers import SentenceTransformer
 
-        if "rps" in self.params and self.params["rps"]:
-            self.model = AutoModel.from_pretrained(self.name, trust_remote_code=True, safe_serialization=True, rotary_scaling_factor=2 )
-        else:
-            self.model = AutoModel.from_pretrained(self.name, trust_remote_code=True)
+        self.model = SentenceTransformer(self.name, trust_remote_code=True)
+        self.tokenizer = self.model.tokenizer
 
-        print("CONFIG", self.model.config)
+        # if "rps" in self.params and self.params["rps"]:
+        #     self.model = AutoModel.from_pretrained(self.name, trust_remote_code=True, safe_serialization=True, rotary_scaling_factor=2 )
+        # else:
+        #     self.model = AutoModel.from_pretrained(self.name, trust_remote_code=True)
 
-        if self.name == "nomic-ai/nomic-embed-text-v1" or self.name == "nomic-ai/nomic-embed-text-v1.5":
-            self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", model_max_length=self.params["max_tokens"])
-        else:
-            self.tokenizer = AutoTokenizer.from_pretrained(self.name)
+        # print("CONFIG", self.model.config)
+
+        # if self.name == "nomic-ai/nomic-embed-text-v1" or self.name == "nomic-ai/nomic-embed-text-v1.5":
+        #     self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", model_max_length=self.params["max_tokens"])
+        # else:
+        #     self.tokenizer = AutoTokenizer.from_pretrained(self.name)
 
         self.model.to(self.device)
         self.model.eval()
 
     def embed(self, inputs, dimensions=None):
-        encoded_input = self.tokenizer(inputs, padding=self.params["padding"], truncation=self.params["truncation"], return_tensors='pt')
-        encoded_input = {key: value.to(self.device) for key, value in encoded_input.items()}
-        pool = self.params["pooling"]
+        # encoded_input = self.tokenizer(inputs, padding=self.params["padding"], truncation=self.params["truncation"], return_tensors='pt')
+        # encoded_input = {key: value.to(self.device) for key, value in encoded_input.items()}
+        # pool = self.params["pooling"]
         # Compute token embeddings
-        with self.torch.no_grad():
-            model_output = self.model(**encoded_input)
-            if pool == "cls":
-                embeddings = self.cls_pooling(model_output)
-            elif pool == "average":
-                embeddings = self.average_pooling(model_output, encoded_input["attention_mask"])
-            elif pool == "mean":
-                embeddings = self.mean_pooling(model_output, encoded_input["attention_mask"])
+        # with self.torch.no_grad():
+        #     model_output = self.model(**encoded_input)
+        #     if pool == "cls":
+        #         embeddings = self.cls_pooling(model_output)
+        #     elif pool == "average":
+        #         embeddings = self.average_pooling(model_output, encoded_input["attention_mask"])
+        #     elif pool == "mean":
+        #         embeddings = self.mean_pooling(model_output, encoded_input["attention_mask"])
+        embeddings = self.model.encode(inputs, convert_to_tensor=True)
 
         # Support Matroyshka embeddings
         if dimensions is not None and dimensions > 0:
             embeddings = self.torch.nn.functional.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],))
diff --git a/requirements.txt b/requirements.txt
index 8f22479..ebe44c2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -133,6 +133,7 @@ safetensors~=0.4.1
 scikit-learn~=1.3.2
 scipy~=1.11.4
 Send2Trash~=1.8.2
+sentence-transformers~=3.0.1
 six~=1.16.0
 sniffio~=1.3.0
 soupsieve~=2.5
@@ -159,7 +160,7 @@ tzdata~=2023.4
 umap-learn~=0.5.5
 uri-template~=1.3.0
 urllib3~=2.1.0
-voyageai~=0.1.6
+voyageai~=0.2.3
 wcwidth~=0.2.13
 webcolors~=1.13
 webencodings~=0.5.1
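
For reviewers, a minimal standalone sketch of the new code path introduced by this patch. The function name is made up for illustration, and the truncation/re-normalization after the layer_norm call are assumptions, since the hunk context ends at that line:

    # Sketch only, not the provider code itself.
    import torch
    from sentence_transformers import SentenceTransformer

    def embed_texts(texts, model_name, dimensions=None):
        # SentenceTransformer loads the model's own pooling/normalization modules
        # from the model repository, which is why the hand-rolled cls/average/mean
        # pooling helpers are commented out in this patch.
        model = SentenceTransformer(model_name, trust_remote_code=True)
        embeddings = model.encode(texts, convert_to_tensor=True)
        if dimensions is not None and dimensions > 0:
            # Matryoshka-style reduction as in the patched embed(): layer-norm first.
            embeddings = torch.nn.functional.layer_norm(
                embeddings, normalized_shape=(embeddings.shape[1],)
            )
            # Assumed follow-up steps: truncate to the requested width, then
            # re-normalize so downstream cosine similarity still works.
            embeddings = embeddings[:, :dimensions]
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
        return embeddings

    # Example usage with a model name taken from the code above:
    # embed_texts(["hello world"], "nomic-ai/nomic-embed-text-v1.5", dimensions=256)

Because sentence_transformers picks up tokenization and pooling from the model repository, the per-model pooling, padding, and truncation parameters handled manually in the old code are no longer needed here.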