implement cluster labeling interface, similar system for supporting chat and embed models
enjalot committed Jan 23, 2024
1 parent c39e64c commit a7b92e8
Showing 18 changed files with 192 additions and 174 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -35,7 +35,7 @@ npm run dev
Now you can open your browser to the provided URL and use Latent Scope!

## Embedding models
The scripts below (which power the app) reference embedding models by an "id" which identifies models prepared in [models/models.json](models/models.json)
The scripts below (which power the app) reference embedding models by an "id" which identifies models prepared in [models/embedding_models.json](models/embedding_models.json)

There is a `get_model(id)` function which will load the appropriate class based on the model provider. See `providers/` for `transformers`, `openai`, `cohereai`, `togetherai`, `voyageai`.
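For example, a minimal sketch of loading a model by id and embedding a sentence (the id and text below are illustrative; note this commit renames the loader to `get_embedding_model` in `models/__init__.py`):

```python
# minimal sketch, run from the repo root;
# assumes this id exists in models/embedding_models.json (illustrative)
from models import get_embedding_model

model = get_embedding_model("transformers-BAAI___bge-small-en-v1.5")
model.load_model()
vectors = model.embed(["a sentence to embed"])  # list of embedding vectors
```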

@@ -57,7 +57,7 @@ python ingest.py database-curated
```

### 1. embed.py
Take the text from the input and embed it. Default is to use `BAAI/bge-small-en-v1.5` locally via HuggingFace transformers. API services are supported as well, see [models/models.json](models/models.json) for model ids.
Take the text from the input and embed it. Default is to use `BAAI/bge-small-en-v1.5` locally via HuggingFace transformers. API services are supported as well, see [models/embedding_models.json](models/embedding_models.json) for model ids.

```bash
# python embed.py <dataset_name> <text_column> <model_id>
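# for example, with a hypothetical dataset name and model id:
python embed.py database-curated text transformers-BAAI___bge-small-en-v1.5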
2 changes: 1 addition & 1 deletion client/src/components/DatasetExplore.css
@@ -94,6 +94,6 @@
  position: fixed;
  right: 0;
  top: 40px;
  width: 100px;
  width: 350px;
  height: 100%;
}
8 changes: 6 additions & 2 deletions client/src/components/DatasetSetup.jsx
@@ -43,7 +43,7 @@ function DatasetSetup() {
  // get the list of available models
  const [models, setModels] = useState([]);
  useEffect(() => {
    fetch(`http://localhost:5001/models`)
    fetch(`http://localhost:5001/embedding_models`)
      .then(response => response.json())
      .then(data => {
        // console.log("models", data)
@@ -181,8 +181,12 @@ function DatasetSetup() {
      fetch(`http://localhost:5001/datasets/${datasetId}/clusters/${cluster.cluster_name}/labels`)
        .then(response => response.json())
        .then(data => {
          console.log("update labels", cluster, data)
          setClusterLabels(data)
        });
        }).catch(err => {
          console.log(err)
          setClusterLabels([])
        })
    } else {
      setClusterLabels([])
    }
46 changes: 29 additions & 17 deletions models/__init__.py
@@ -1,31 +1,43 @@
import os
import json
from .providers.transformers import TransformersProvider
from .providers.openai import OpenAIProvider
from .providers.cohereai import CohereAIProvider
from .providers.togetherai import TogetherAIProvider
from .providers.voyageai import VoyageAIProvider
from .providers.transformers import TransformersEmbedProvider, TransformersChatProvider
from .providers.openai import OpenAIEmbedProvider, OpenAIChatProvider
from .providers.cohereai import CohereAIEmbedProvider
from .providers.togetherai import TogetherAIEmbedProvider
from .providers.voyageai import VoyageAIEmbedProvider

models_path = os.path.join(os.path.dirname(__file__), "models.json")
with open(models_path, "r") as f:
    model_list = json.load(f)

model_dict = {model['id']: model for model in model_list}
embed_models_path = os.path.join(os.path.dirname(__file__), "embedding_models.json")
with open(embed_models_path, "r") as f:
    embed_model_list = json.load(f)
embed_model_dict = {model['id']: model for model in embed_model_list}


def get_model(id):
def get_embedding_model(id):
    """Returns a ModelProvider instance for the given model id."""
    model = model_dict[id]
    model = embed_model_dict[id]
    if not model:
        raise ValueError(f"Model {id} not found")

    if model['provider'] == "transformers":
        return TransformersProvider(model['name'], model['params'])
        return TransformersEmbedProvider(model['name'], model['params'])
    if model['provider'] == "openai":
        return OpenAIProvider(model['name'], model['params'])
        return OpenAIEmbedProvider(model['name'], model['params'])
    if model['provider'] == "cohereai":
        return CohereAIProvider(model['name'], model['params'])
        return CohereAIEmbedProvider(model['name'], model['params'])
    if model['provider'] == "togetherai":
        return TogetherAIProvider(model['name'], model['params'])
        return TogetherAIEmbedProvider(model['name'], model['params'])
    if model['provider'] == "voyageai":
        return VoyageAIProvider(model['name'], model['params'])
        return VoyageAIEmbedProvider(model['name'], model['params'])

chat_models_path = os.path.join(os.path.dirname(__file__), "chat_models.json")
with open(chat_models_path, "r") as f:
    chat_model_list = json.load(f)
chat_model_dict = {model['id']: model for model in chat_model_list}

def get_chat_model(id):
    """Returns a ModelProvider instance for the given model id."""
    model = chat_model_dict[id]
    if model['provider'] == "transformers":
        return TransformersChatProvider(model['name'], model['params'])
    if model['provider'] == "openai":
        return OpenAIChatProvider(model['name'], model['params'])
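A minimal usage sketch of the new chat side (illustrative messages; the model id comes from `models/chat_models.json` below, and the openai provider assumes `OPENAI_API_KEY` is set):

```python
# minimal sketch of the chat model interface introduced in this commit
from models import get_chat_model

chat_model = get_chat_model("openai-gpt-3.5-turbo")
chat_model.load_model()
label = chat_model.chat([
    {"role": "system", "content": "You label clusters concisely."},       # illustrative prompt
    {"role": "user", "content": "Label this cluster: dogs, cats, fish"},  # illustrative prompt
])
print(label)
```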
48 changes: 48 additions & 0 deletions models/chat_models.json
@@ -0,0 +1,48 @@
[
  {
    "provider": "transformers",
    "name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "sanitized_name": "TinyLlama___TinyLlama-1.1B-Chat-v1.0",
    "id": "transformers-TinyLlama___TinyLlama-1.1B-Chat-v1.0",
    "params": {
      "max_tokens": 2048
    }
  },
  {
    "provider": "openai",
    "name": "gpt-3.5-turbo",
    "id": "openai-gpt-3.5-turbo",
    "params": {
      "max_tokens": 4096
    }
  },
  {
    "provider": "openai",
    "name": "gpt-4-1106-preview",
    "id": "openai-gpt-4-1106-preview",
    "params": {
      "max_tokens": 8192
    }
  },
  {
    "provider": "cohereai",
    "name": "embed-english-v3.0",
    "id": "cohereai-embed-english-v3.0",
    "params": {
      "input_type": "clustering"
    }
  },
  {
    "provider": "togetherai",
    "name": "togethercomputer/m2-bert-80M-2k-retrieval",
    "sanitized_name": "togethercomputer___m2-bert-80M-2k-retrieval",
    "id": "togetherai-togethercomputer___m2-bert-80M-2k-retrieval",
    "modality": "text",
    "params": {
      "truncation": true,
      "padding": true,
      "max_tokens": 2048
    }
  }
]
File renamed without changes.
13 changes: 12 additions & 1 deletion models/providers/base.py
@@ -1,4 +1,4 @@
class ModelProvider:
class EmbedModelProvider:
    def __init__(self, name, params):
        self.name = name
        self.params = params
@@ -9,3 +9,14 @@ def load_model(self):
    def embed(self, text):
        raise NotImplementedError("This method should be implemented by subclasses.")

class ChatModelProvider:
    def __init__(self, name, params):
        self.name = name
        self.params = params

    def load_model(self):
        raise NotImplementedError("This method should be implemented by subclasses.")

    def chat(self, messages):
        raise NotImplementedError("This method should be implemented by subclasses.")
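Together these base classes define the provider contract; a new backend plugs in by subclassing one of them, e.g. this hypothetical toy provider:

```python
# hypothetical provider, only to show the interface shape
from .base import ChatModelProvider

class EchoChatProvider(ChatModelProvider):
    def load_model(self):
        pass  # nothing to load for this toy backend

    def chat(self, messages):
        # echo the last user message back
        return messages[-1]["content"]
```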

4 changes: 2 additions & 2 deletions models/providers/cohereai.py
@@ -2,11 +2,11 @@
import time
import cohere
from dotenv import load_dotenv
from .base import ModelProvider
from .base import EmbedModelProvider

load_dotenv()

class CohereAIProvider(ModelProvider):
class CohereAIEmbedProvider(EmbedModelProvider):
    def load_model(self):
        self.client = cohere.Client(os.getenv("COHERE_API_KEY"))
22 changes: 17 additions & 5 deletions models/providers/openai.py
@@ -3,14 +3,14 @@
import tiktoken
from openai import OpenAI
from dotenv import load_dotenv
from .base import ModelProvider
from .base import EmbedModelProvider, ChatModelProvider

load_dotenv()

class OpenAIProvider(ModelProvider):
class OpenAIEmbedProvider(EmbedModelProvider):
    def load_model(self):
        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        self.encoder = tiktoken.encoding_for_model("text-embedding-ada-002")
        self.encoder = tiktoken.encoding_for_model(self.name)

    def embed(self, inputs):
        time.sleep(0.01) # TODO proper rate limiting
@@ -20,7 +20,19 @@ def embed(self, inputs):
        inputs = [enc.decode(enc.encode(b)[:max_tokens]) if len(enc.encode(b)) > max_tokens else b for b in inputs]
        response = self.client.embeddings.create(
            input=inputs,
            model="text-embedding-ada-002",
            model=self.name,
        )
        embeddings = [embedding.embedding for embedding in response.data]
        return embeddings

class OpenAIChatProvider(ChatModelProvider):
    def load_model(self):
        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        self.encoder = tiktoken.encoding_for_model(self.name)

    def chat(self, messages):
        response = self.client.chat.completions.create(
            model=self.name,
            messages=messages
        )
        return response.choices[0].message.content
4 changes: 2 additions & 2 deletions models/providers/togetherai.py
@@ -3,11 +3,11 @@
import tiktoken
import together
from dotenv import load_dotenv
from .base import ModelProvider
from .base import EmbedModelProvider

load_dotenv()

class TogetherAIProvider(ModelProvider):
class TogetherAIEmbedProvider(EmbedModelProvider):
    def load_model(self):
        together.api_key = os.getenv("TOGETHER_API_KEY")
        self.client = together.Together()
24 changes: 20 additions & 4 deletions models/providers/transformers.py
@@ -1,6 +1,6 @@
import torch
from .base import ModelProvider
from transformers import AutoTokenizer, AutoModel
from .base import EmbedModelProvider, ChatModelProvider
from transformers import AutoTokenizer, AutoModel, pipeline

def cls_pooling(model_output):
    return model_output[0][:, 0]
@@ -15,7 +15,7 @@ def mean_pooling(model_output, attention_mask):
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


class TransformersProvider(ModelProvider):
class TransformersEmbedProvider(EmbedModelProvider):
    def load_model(self):
        self.tokenizer = AutoTokenizer.from_pretrained(self.name)
        self.model = AutoModel.from_pretrained(self.name, trust_remote_code=True)
@@ -36,4 +36,20 @@ def embed(self, inputs):

        # Normalize embeddings
        normalized_embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
        return normalized_embeddings.tolist()


class TransformersChatProvider(ChatModelProvider):
    def load_model(self):
        # self.pipe = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.bfloat16, device_map="auto")
        # self.pipe = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.bfloat16, device_map="cpu")
        # TODO: support bfloat16 for non-mac environments
        self.pipe = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16, device_map="auto")
        self.encoder = self.pipe.tokenizer

    def chat(self, messages, max_new_tokens=24):
        prompt = self.pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        outputs = self.pipe(prompt, max_new_tokens=max_new_tokens, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
        generated_text = outputs[0]["generated_text"]
        print("GENERATED TEXT", generated_text)
        return generated_text.split("<|assistant|>")[1].strip()
4 changes: 2 additions & 2 deletions models/providers/voyageai.py
@@ -2,11 +2,11 @@
import time
import voyageai
from dotenv import load_dotenv
from .base import ModelProvider
from .base import EmbedModelProvider

load_dotenv()

class VoyageAIProvider(ModelProvider):
class VoyageAIEmbedProvider(EmbedModelProvider):
    def load_model(self):
        # voyageai.api_key = os.getenv("VOYAGE_API_KEY")
        self.client = voyageai.Client(os.getenv("VOYAGE_API_KEY"))
4 changes: 2 additions & 2 deletions python_server/search.py
@@ -5,7 +5,7 @@

# TODO is this a hacky way to import from the models directory?
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from models import get_model
from models import get_embedding_model

# Create a Blueprint
search_bp = Blueprint('search_bp', __name__)
@@ -26,7 +26,7 @@ def nn():
    num = 150
    if model_id not in MODELS:
        print("loading model", model_id)
        model = get_model(model_id)
        model = get_embedding_model(model_id)
        model.load_model()
        MODELS[model_id] = model
    else:
4 changes: 2 additions & 2 deletions python_server/server.py
@@ -34,10 +34,10 @@ def send_file(datasetPath):
    print("req url", request.url)
    return send_from_directory(os.path.join(os.getcwd(), '../data/'), datasetPath)

@app.route('/models', methods=['GET'])
@app.route('/embedding_models', methods=['GET'])
def get_models():
    directory_path = os.path.join(os.getcwd(), '../models/')
    file_path = os.path.join(directory_path, 'models.json')
    file_path = os.path.join(directory_path, 'embedding_models.json')
    with open(file_path, 'r', encoding='utf-8') as file:
        models = json.load(file)
    return jsonify(models)
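Once the server is running, the renamed route can be sanity-checked directly (a sketch; assumes the default port 5001 that the client fetches from):

```python
import json
import urllib.request

# list the model ids served by the /embedding_models endpoint
with urllib.request.urlopen("http://localhost:5001/embedding_models") as resp:
    models = json.load(resp)
print([m["id"] for m in models])
```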
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,3 +1,4 @@
accelerate==0.26.1
aiohttp==3.9.1
aiolimiter==1.1.0
aiosignal==1.3.1
4 changes: 2 additions & 2 deletions scripts/embed.py
@@ -8,7 +8,7 @@

# TODO is this a hacky way to import from the models directory?
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from models import get_model
from models import get_embedding_model

def chunked_iterable(iterable, size):
    """Yield successive chunks from an iterable."""
@@ -19,7 +19,7 @@ def embedder(dataset_name, text_column="text", model_id="transformers-BAAI___bge
    df = pd.read_parquet(f"../data/{dataset_name}/input.parquet")
    sentences = df[text_column].tolist()

    model = get_model(model_id)
    model = get_embedding_model(model_id)
    print("loading", model.name)
    model.load_model()
