add embedding scripts for cohere, together and voyage
enjalot committed Jan 17, 2024
1 parent 2face9e commit 2052146
Showing 5 changed files with 277 additions and 1 deletion.
3 changes: 2 additions & 1 deletion .env.example
@@ -1,3 +1,4 @@
OPENAI_API_KEY=XXX
VOYAGE_API_KEY=XXX
TOGETHER_API_KEY=XXX
COHERE_API_KEY=XXX
9 changes: 9 additions & 0 deletions requirements.txt
@@ -6,10 +6,12 @@ anyio==4.2.0
appnope==0.1.3
asttokens==2.4.1
attrs==23.2.0
backoff==2.2.1
blinker==1.7.0
certifi==2023.11.17
charset-normalizer==3.3.2
click==8.1.7
cohere==4.44
comm==0.2.1
contourpy==1.2.0
cycler==0.12.1
@@ -19,6 +21,7 @@ decorator==5.1.1
distro==1.9.0
einops==0.7.0
executing==2.0.1
fastavro==1.9.3
filelock==3.13.1
Flask==3.0.0
Flask-Cors==4.0.0
@@ -31,6 +34,7 @@ httpcore==1.0.2
httpx==0.26.0
huggingface-hub==0.20.2
idna==3.6
importlib-metadata==6.11.0
ipykernel==6.29.0
ipython==8.20.0
itsdangerous==2.1.2
@@ -80,17 +84,21 @@ scikit-learn==1.3.2
scipy==1.11.4
six==1.16.0
sniffio==1.3.0
sseclient-py==1.8.0
stack-data==0.6.3
sympy==1.12
tabulate==0.9.0
tenacity==8.2.3
threadpoolctl==3.2.0
tiktoken==0.5.2
together==0.2.10
tokenizers==0.15.0
torch==2.1.2
tornado==6.4
tqdm==4.66.1
traitlets==5.14.1
transformers==4.36.2
typer==0.9.0
typing_extensions==4.9.0
tzdata==2023.4
umap-learn==0.5.5
@@ -99,3 +107,4 @@ voyageai==0.1.6
wcwidth==0.2.13
Werkzeug==3.0.1
yarl==1.9.4
zipp==3.17.0
84 changes: 84 additions & 0 deletions scripts/embed-cohereai.py
@@ -0,0 +1,84 @@
# Usage: python embed-cohereai.py <dataset_name> <text_column> <model>
import os
import json
import time
import cohere
import argparse
import numpy as np
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv

load_dotenv()

def chunked_iterable(iterable, size):
"""Yield successive chunks from an iterable."""
for i in range(0, len(iterable), size):
yield iterable[i:i + size]


def embedder(dataset_name, text_column="text", model_name="embed-english-v3.0"):
# TODO: have lookup table for truncate lengths

df = pd.read_parquet(f"../data/{dataset_name}/input.parquet")
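    # expects the dataset at ../data/<dataset_name>/input.parquet, relative to scripts/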
# Sentences we want sentence embeddings for
sentences = df[text_column].tolist()
print("embedding", len(sentences), "sentences")

batch_size = 100
sentence_embeddings = []

rate_limit = 60 # number of requests per minute
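    # fixed-window rate limiting: after `rate_limit` requests, sleep out the rest of the minute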
start_time = time.time()
request_count = 0
client = cohere.Client(os.getenv("COHERE_API_KEY"))

    for batch in tqdm(chunked_iterable(sentences, batch_size), total=(len(sentences) + batch_size - 1) // batch_size):
# inputs = [b.replace("\n", " ") for b in batch]
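        # v3 embed models require an input_type; "clustering" fits the downstream UMAP/cluster steps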
response = client.embed(texts=batch, model=model_name, input_type="clustering")
embeddings = response.embeddings
sentence_embeddings.extend(embeddings)

time.sleep(0.01)
# Rate limit the requests
request_count += 1
if request_count >= rate_limit:
elapsed_time = time.time() - start_time
if elapsed_time < 60:
time.sleep(60 - elapsed_time)
start_time = time.time()
request_count = 0

print("sentence embeddings:", len(sentence_embeddings))
# Convert sentence_embeddings to numpy
np_embeds = np.array(sentence_embeddings)
print("sentence embeddings:", np_embeds.shape)


# Save embeddings as a numpy file
if not os.path.exists(f'../data/{dataset_name}/embeddings'):
os.makedirs(f'../data/{dataset_name}/embeddings')

# TODO: make the sanitization a function
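    # "/" in model names would otherwise create nested directories in the saved filename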
safe_model_name = "cohereai-" + model_name.replace("/", "___")

np.save(f'../data/{dataset_name}/embeddings/{safe_model_name}.npy', np_embeds)
    # write out a json file with metadata about the embedded dataset
with open(f'../data/{dataset_name}/meta.json', 'w') as f:
json.dump({
"id": dataset_name,
"text_column": text_column,
"length": len(sentences),
}, f, indent=2)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Embed a dataset using Cohere')
    parser.add_argument('name', type=str, help='Dataset name (directory name in data/)')
    parser.add_argument('text_column', type=str, nargs='?', help='Name of the column containing the text to embed', default='text')
    parser.add_argument('model', type=str, nargs='?', help='Name of the Cohere embedding model to use', default="embed-english-v3.0")

# Parse arguments
args = parser.parse_args()

embedder(args.name, args.text_column, args.model)
98 changes: 98 additions & 0 deletions scripts/embed-togetherai.py
@@ -0,0 +1,98 @@
# Usage: python embed-togetherai.py <dataset_name> <text_column> <model>
import os
import json
import time
import tiktoken
import argparse
import together
import numpy as np
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv

load_dotenv()


def chunked_iterable(iterable, size):
"""Yield successive chunks from an iterable."""
for i in range(0, len(iterable), size):
yield iterable[i:i + size]


def embedder(dataset_name, text_column="text", model_name="WhereIsAI/UAE-Large-V1"):
# TODO: have lookup table for truncate lengths
# https://docs.together.ai/docs/embedding-models
truncate = 512
# TODO: figure out encoder for each model
enc = tiktoken.encoding_for_model("text-embedding-ada-002")
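    # NOTE: cl100k_base (the ada-002 encoder) only approximates these models' own tokenizers;
    # it is used here just to bound input length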

df = pd.read_parquet(f"../data/{dataset_name}/input.parquet")
# Sentences we want sentence embeddings for
sentences = df[text_column].tolist()
print("embedding", len(sentences), "sentences")

batch_size = 100
sentence_embeddings = []

rate_limit = 60 # number of requests per minute
start_time = time.time()
request_count = 0
together.api_key = os.getenv("TOGETHER_API_KEY")
client = together.Together()

    for batch in tqdm(chunked_iterable(sentences, batch_size), total=(len(sentences) + batch_size - 1) // batch_size):
        # Truncate each text to at most `truncate` tokens so it fits the model's context window
        inputs = [b.replace("\n", " ") for b in batch]
        inputs = [enc.decode(enc.encode(t)[:truncate]) if len(enc.encode(t)) > truncate else t for t in inputs]

print("sending", len(inputs), "sentences to together.ai")
response = client.embeddings.create(
input=inputs,
model=model_name
)
embeddings = [response.data[i].embedding for i in range(len(inputs))]
sentence_embeddings.extend(embeddings)

time.sleep(0.01)
# Rate limit the requests
request_count += 1
if request_count >= rate_limit:
elapsed_time = time.time() - start_time
if elapsed_time < 60:
time.sleep(60 - elapsed_time)
start_time = time.time()
request_count = 0

print("sentence embeddings:", len(sentence_embeddings))
# Convert sentence_embeddings to numpy
np_embeds = np.array(sentence_embeddings)
print("sentence embeddings:", np_embeds.shape)


# Save embeddings as a numpy file
if not os.path.exists(f'../data/{dataset_name}/embeddings'):
os.makedirs(f'../data/{dataset_name}/embeddings')

# TODO: make the sanitization a function
safe_model_name = "together-" + model_name.replace("/", "___")
np.save(f'../data/{dataset_name}/embeddings/{safe_model_name}.npy', np_embeds)
    # write out a json file with metadata about the embedded dataset
with open(f'../data/{dataset_name}/meta.json', 'w') as f:
json.dump({
"id": dataset_name,
"text_column": text_column,
"length": len(sentences),
}, f, indent=2)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Embed a dataset using Together AI')
    parser.add_argument('name', type=str, help='Dataset name (directory name in data/)')
    parser.add_argument('text_column', type=str, nargs='?', help='Name of the column containing the text to embed', default='text')
    parser.add_argument('model', type=str, nargs='?', help='Name of the Together-hosted embedding model to use', default="WhereIsAI/UAE-Large-V1")

# Parse arguments
args = parser.parse_args()

embedder(args.name, args.text_column, args.model)
84 changes: 84 additions & 0 deletions scripts/embed-voyageai.py
@@ -0,0 +1,84 @@
# Usage: python embed-voyageai.py <dataset_name> <text_column> <model>
import os
import json
import time
import argparse
import voyageai
import numpy as np
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv

load_dotenv()

def chunked_iterable(iterable, size):
"""Yield successive chunks from an iterable."""
for i in range(0, len(iterable), size):
yield iterable[i:i + size]


def embedder(dataset_name, text_column="text", model_name="voyage-02"):
# TODO: have lookup table for truncate lengths

df = pd.read_parquet(f"../data/{dataset_name}/input.parquet")
# Sentences we want sentence embeddings for
sentences = df[text_column].tolist()
print("embedding", len(sentences), "sentences")

batch_size = 100
sentence_embeddings = []

rate_limit = 60 # number of requests per minute
start_time = time.time()
request_count = 0
voyageai.api_key = os.getenv("VOYAGE_API_KEY")
client = voyageai.Client()

    for batch in tqdm(chunked_iterable(sentences, batch_size), total=(len(sentences) + batch_size - 1) // batch_size):
# inputs = [b.replace("\n", " ") for b in batch]
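        # truncation=True lets the Voyage API clip over-length inputs server-side instead of erroring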
response = client.embed(batch, model=model_name, truncation=True)
embeddings = response.embeddings
sentence_embeddings.extend(embeddings)

time.sleep(0.1)
# Rate limit the requests
request_count += 1
if request_count >= rate_limit:
elapsed_time = time.time() - start_time
if elapsed_time < 60:
time.sleep(60 - elapsed_time)
start_time = time.time()
request_count = 0

print("sentence embeddings:", len(sentence_embeddings))
# Convert sentence_embeddings to numpy
np_embeds = np.array(sentence_embeddings)
print("sentence embeddings:", np_embeds.shape)


# Save embeddings as a numpy file
if not os.path.exists(f'../data/{dataset_name}/embeddings'):
os.makedirs(f'../data/{dataset_name}/embeddings')

# TODO: make the sanitization a function
safe_model_name = "voyageai-" + model_name.replace("/", "___")
np.save(f'../data/{dataset_name}/embeddings/{safe_model_name}.npy', np_embeds)
    # write out a json file with metadata about the embedded dataset
with open(f'../data/{dataset_name}/meta.json', 'w') as f:
json.dump({
"id": dataset_name,
"text_column": text_column,
"length": len(sentences),
}, f, indent=2)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Embed a dataset using Voyage AI')
    parser.add_argument('name', type=str, help='Dataset name (directory name in data/)')
    parser.add_argument('text_column', type=str, nargs='?', help='Name of the column containing the text to embed', default='text')
    parser.add_argument('model', type=str, nargs='?', help='Name of the Voyage embedding model to use', default="voyage-02")

# Parse arguments
args = parser.parse_args()

embedder(args.name, args.text_column, args.model)
