Skip to content

Commit

Permalink
support Matryoshka embeddings in nomic 1.5 and openai-small-3 and ope…
Browse files Browse the repository at this point in the history
…nai-large-3
  • Loading branch information
enjalot committed Feb 28, 2024
1 parent c010ccc commit ae1375f
Show file tree
Hide file tree
Showing 8 changed files with 71 additions and 24 deletions.
21 changes: 19 additions & 2 deletions latentscope/models/embedding_models.json
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,26 @@
"id": "transformers-nomic-ai___nomic-embed-text-v1",
"modality": "text",
"params": {
"max_tokens": 8192,
"truncation": true,
"padding": true,
"pooling": "mean"
}
},
{
"provider": "transformers",
"name": "nomic-ai/nomic-embed-text-v1.5",
"sanitized_name": "nomic-ai___nomic-embed-text-v1.5",
"id": "transformers-nomic-ai___nomic-embed-text-v1.5",
"modality": "text",
"params": {
"max_tokens": 8192,
"truncation": true,
"padding": true,
"pooling": "mean",
"dimensions": [768, 512, 256, 128, 64]
}
},
{
"provider": "transformers",
"name": "BAAI/bge-small-en-v1.5",
Expand Down Expand Up @@ -137,15 +152,17 @@
"name": "text-embedding-3-small",
"id": "openai-text-embedding-3-small",
"params": {
"max_tokens": 8192
"max_tokens": 8192,
"dimensions": [1536, 768, 512, 256, 128, 64]
}
},
{
"provider": "openai",
"name": "text-embedding-3-large",
"id": "openai-text-embedding-3-large",
"params": {
"max_tokens": 8192
"max_tokens": 8192,
"dimensions": [3072, 1536, 768, 512, 256, 128, 64]
}
},
{
Expand Down
3 changes: 2 additions & 1 deletion latentscope/models/providers/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def load_model(self):
else:
self.encoder = tiktoken.encoding_for_model(self.name)

def embed(self, inputs):
def embed(self, inputs, dimensions=None):
time.sleep(0.01) # TODO proper rate limiting
enc = self.encoder
max_tokens = self.params["max_tokens"]
Expand All @@ -28,6 +28,7 @@ def embed(self, inputs):
response = self.client.embeddings.create(
input=inputs,
model=self.name,
dimensions=dimensions
)
embeddings = [embedding.embedding for embedding in response.data]
return embeddings
Expand Down
18 changes: 12 additions & 6 deletions latentscope/models/providers/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,15 @@ def mean_pooling(model_output, attention_mask):

class TransformersEmbedProvider(EmbedModelProvider):
def load_model(self):
if self.name == "nomic-ai/nomic-embed-text-v1":
self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
if self.name == "nomic-ai/nomic-embed-text-v1" or self.name == "nomic-ai/nomic-embed-text-v1.5":
self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", model_max_length=self.params["max_tokens"])
self.model = AutoModel.from_pretrained("nomic-ai/nomic-embed-text-v1", trust_remote_code=True, rotary_scaling_factor=2 )
else:
self.tokenizer = AutoTokenizer.from_pretrained(self.name)
self.model = AutoModel.from_pretrained(self.name, trust_remote_code=True)
self.model = AutoModel.from_pretrained(self.name, trust_remote_code=True)
self.model.eval()

def embed(self, inputs):
def embed(self, inputs, dimensions=None):
encoded_input = self.tokenizer(inputs, padding=self.params["padding"], truncation=self.params["truncation"], return_tensors='pt')
pool = self.params["pooling"]
# Compute token embeddings
Expand All @@ -37,9 +38,14 @@ def embed(self, inputs):
elif pool == "mean":
embeddings = mean_pooling(model_output, encoded_input["attention_mask"])

        # Support Matryoshka embeddings: layer-norm first, then truncate to the requested dimensionality
if dimensions is not None:
embeddings = torch.nn.functional.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],))
embeddings = embeddings[:, :dimensions]

# Normalize embeddings
normalized_embeedings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
return normalized_embeedings.tolist()
normalized_embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
return normalized_embeddings.tolist()


class TransformersChatProvider(ChatModelProvider):
Expand Down
9 changes: 5 additions & 4 deletions latentscope/scripts/embed.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,15 @@ def main():
parser.add_argument('dataset_id', type=str, help='Dataset id (directory name in data/)')
parser.add_argument('text_column', type=str, help='Output file', default='text')
parser.add_argument('model_id', type=str, help='ID of embedding model to use', default="transformers-BAAI___bge-small-en-v1.5")
parser.add_argument('prefix', type=str, help='Prefix to prepend to text before embedding', default="")
parser.add_argument('--prefix', type=str, help='Prefix to prepend to text before embedding', default="")
parser.add_argument('--dimensions', type=int, help='Truncate embeddings to dimensions a la Matroyshka embeddings', default=None)
parser.add_argument('--rerun', type=str, help='Rerun the given embedding from last completed batch')

# Parse arguments
args = parser.parse_args()
embed(args.dataset_id, args.text_column, args.model_id, args.prefix, args.rerun)
embed(args.dataset_id, args.text_column, args.model_id, args.prefix, args.rerun, args.dimensions)

def embed(dataset_id, text_column, model_id, prefix, rerun):
def embed(dataset_id, text_column, model_id, prefix, rerun, dimensions):
DATA_DIR = get_data_dir()
df = pd.read_parquet(os.path.join(DATA_DIR, dataset_id, "input.parquet"))

Expand Down Expand Up @@ -115,7 +116,7 @@ def embed(dataset_id, text_column, model_id, prefix, rerun):
print(f"skipping batch {i}/{total_batches}", flush=True)
continue
try:
embeddings = np.array(model.embed(batch))
embeddings = np.array(model.embed(batch, dimensions=dimensions))
append_to_hdf5(os.path.join(embedding_dir, f"{embedding_id}.h5"), embeddings)
except Exception as e:
print(batch)
Expand Down
8 changes: 4 additions & 4 deletions latentscope/server/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,15 @@ def get_datasets():
"""
def scan_for_json_files(directory_path, match_pattern=r".*\.json$"):
try:
files = os.listdir(directory_path)
# files = os.listdir(directory_path)
files = sorted(os.listdir(directory_path), key=lambda x: os.path.getmtime(os.path.join(directory_path, x)), reverse=True)
except OSError as err:
print('Unable to scan directory:', err)
return jsonify({"error": "Unable to scan directory"}), 500

json_files = [file for file in files if re.match(match_pattern, file)]
json_files.sort()
print("files", files)
print("json", json_files)
# print("files", files)
# print("json", json_files)

json_contents = []
for file in json_files:
Expand Down
3 changes: 2 additions & 1 deletion latentscope/server/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,9 +135,10 @@ def run_embed():
text_column = request.args.get('text_column')
model_id = request.args.get('model_id') # model id
prefix = request.args.get('prefix')
dimensions = request.args.get('dimensions')

job_id = str(uuid.uuid4())
command = f'ls-embed {dataset} {text_column} {model_id} "{prefix}"'
command = f'ls-embed {dataset} {text_column} {model_id} --prefix="{prefix}" --dimensions={dimensions}'
threading.Thread(target=run_job, args=(dataset, job_id, command)).start()
return jsonify({"job_id": job_id})

Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ notebook==7.0.7
notebook_shim==0.2.3
numba==0.58.1
numpy==1.26.3
openai==1.8.0
openai==1.12.0
opt-einsum==3.3.0
orjson==3.9.12
overrides==7.7.0
Expand Down
31 changes: 26 additions & 5 deletions web/src/components/Setup/Embedding.jsx
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// NewEmbedding.jsx
import { useState, useEffect, useMemo } from 'react';
import { useState, useEffect, useCallback } from 'react';
import JobProgress from '../Job/Progress';
import { useStartJobPolling } from '../Job/Run';
const apiUrl = import.meta.env.VITE_API_URL
Expand Down Expand Up @@ -32,10 +32,17 @@ function EmbeddingNew({ dataset, textColumn, embedding, umaps, clusters, onNew,
useEffect(() => {
fetch(`${apiUrl}/embedding_models`)
.then(response => response.json())
.then(setModels)
.then((data) => {
setModels(data)
setModel(data[0])
})
.catch(console.error);
}, []);

const [model, setModel] = useState(null);
// for the models that support choosing the size of dimensions
const [dimensions, setDimensions] = useState(null)

const fetchEmbeddings = (datasetId, callback) => {
fetch(`${apiUrl}/datasets/${datasetId}/embeddings`)
.then(response => response.json())
Expand Down Expand Up @@ -66,7 +73,7 @@ function EmbeddingNew({ dataset, textColumn, embedding, umaps, clusters, onNew,
}
}, [embeddingsJob, dataset, setEmbeddings, onNew])

const handleNewEmbedding = (e) => {
const handleNewEmbedding = useCallback((e) => {
e.preventDefault();
const form = e.target;
const data = new FormData(form);
Expand All @@ -77,25 +84,39 @@ function EmbeddingNew({ dataset, textColumn, embedding, umaps, clusters, onNew,
model_id: model.id,
prefix
};
if(dimensions) job.dimensions = dimensions
startEmbeddingsJob(job);
};
}, [startEmbeddingsJob, textColumn, models, dimensions]);

const handleRerunEmbedding = (job) => {
rerunEmbeddingsJob({job_id: job?.id});
}

const handleModelChange = (e) => {
const model = models.find(model => model.id === e.target.value);
setModel(model)
}
const handleDimensionsChange = (e) => {
setDimensions(+e.target.value)
}

return (
<div>
<div className={styles["embeddings-form"]}>
Embedding on column: <b>{textColumn}</b>
<form onSubmit={handleNewEmbedding}>
<label htmlFor="modelName">Model:
<select id="modelName" name="modelName" disabled={!!embeddingsJob}>
<select id="modelName" name="modelName" disabled={!!embeddingsJob} onChange={handleModelChange}>
{models.map((model, index) => (
<option key={index} value={model.id}>{model.provider}: {model.name}</option>
))}
</select></label>
<textarea name="prefix" placeholder={`Optional prefix to prepend to each ${textColumn}`} disabled={!!embeddingsJob}></textarea>
{model && model.params.dimensions ? <select onChange={handleDimensionsChange}>
{model.params.dimensions.map((dim, index) => {
return <option key={index} value={dim}>{dim}</option>
})}
</select> : null}
<button type="submit" disabled={!!embeddingsJob}>New Embedding</button>
</form>
</div>
Expand Down

0 comments on commit ae1375f

Please sign in to comment.