Skip to content

Commit

Permalink
fix setup scope logic. add mistral embed and chat
Browse files Browse the repository at this point in the history
  • Loading branch information
enjalot committed Jan 23, 2024
1 parent a7b92e8 commit 35da197
Show file tree
Hide file tree
Showing 10 changed files with 217 additions and 97 deletions.
3 changes: 2 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
OPENAI_API_KEY=XXX
VOYAGE_API_KEY=XXX
TOGETHER_API_KEY=XXX
COHERE_API_KEY=XXX
COHERE_API_KEY=XXX
MISTRAL_API_KEY=XXX
79 changes: 46 additions & 33 deletions client/src/components/DatasetSetup.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ function DatasetSetup() {
fetch(`http://localhost:5001/embedding_models`)
.then(response => response.json())
.then(data => {
// console.log("models", data)
console.log("models", data)
setModels(data)
});
}, []);
Expand Down Expand Up @@ -97,7 +97,7 @@ function DatasetSetup() {
const { startJob: deleteUmapJob } = useStartJobPolling(dataset, setUmapJob, 'http://localhost:5001/jobs/delete/umap');

const [umaps, setUmaps] = useState([]);
useEffect(() => {
function fetchUmaps(datasetId, callback) {
fetch(`http://localhost:5001/datasets/${datasetId}/umaps`)
.then(response => response.json())
.then(data => {
Expand All @@ -107,9 +107,12 @@ function DatasetSetup() {
url: `http://localhost:5001/files/${datasetId}/umaps/${d.name}.png`,
}
})
setUmaps(array.reverse())
callback(array.reverse())
});
}, [datasetId, umapJob]);
}
useEffect(() => {
fetchUmaps(datasetId, setUmaps)
}, [datasetId, setUmaps, umapJob]);

const [umap, setUmap] = useState(umaps[0]);
useEffect(() => {
Expand Down Expand Up @@ -139,6 +142,8 @@ function DatasetSetup() {
// console.log("umap points", data)
setPoints(data.map(d => [d.x, d.y]))
})
} else {
setPoints([])
}
}, [dataset, umap])

Expand All @@ -151,7 +156,7 @@ function DatasetSetup() {
const { startJob: deleteClusterJob } = useStartJobPolling(dataset, setClusterJob, 'http://localhost:5001/jobs/delete/cluster');

const [clusters, setClusters] = useState([]);
useEffect(() => {
function fetchClusters(datasetId, callback) {
fetch(`http://localhost:5001/datasets/${datasetId}/clusters`)
.then(response => response.json())
.then(data => {
Expand All @@ -162,9 +167,12 @@ function DatasetSetup() {
}
})
// console.log("clusters", clusters)
setClusters(array.reverse())
callback(array.reverse())
});
}, [datasetId, clusterJob]);
}
useEffect(() => {
fetchClusters(datasetId, setClusters)
}, [datasetId, setClusters, clusterJob]);

const [cluster, setCluster] = useState(clusters[0]);
useEffect(() => {
Expand All @@ -181,7 +189,6 @@ function DatasetSetup() {
fetch(`http://localhost:5001/datasets/${datasetId}/clusters/${cluster.cluster_name}/labels`)
.then(response => response.json())
.then(data => {
console.log("update labels", cluster, data)
setClusterLabels(data)
}).catch(err => {
console.log(err)
Expand Down Expand Up @@ -215,21 +222,29 @@ function DatasetSetup() {
}, [datasetId, setScopes]);

useEffect(() => {
// TODO: this seems like a runaround on React.
// I want to have umap and cluster be set by the scope
// but i don't want to override any temporary changes to umap and cluster
// real solution probably involves some kind of staging state
async function setters(scope) {
setScope(scope)
setEmbedding(scope.embeddings)
const tumaps = await new Promise((resolve) => fetchUmaps(datasetId, (data) => resolve(data)));
const selectedUmap = tumaps.find(u => u.name === scope.umap);
const tclusters = await new Promise((resolve) => fetchClusters(datasetId, (data) => resolve(data)));
const selectedCluster = tclusters.find(c => c.cluster_name === scope.cluster);
setUmap(selectedUmap);
setCluster(selectedCluster);
}
if(scopeId && scopes.length) {
const scope = scopes.find(d => d.name == scopeId)
if(scope) {
setScope(scope)
setEmbedding(scope.embeddings)
const selectedUmap = umaps.find(u => u.name === scope.umap);
const selectedCluster = clusters.find(c => c.cluster_name === scope.cluster);
setUmap(selectedUmap);
setCluster(selectedCluster);
setters(scope)
}

} else {
setScope(null)
}
}, [scopeId, scopes, umaps, clusters, setScope, setUmap, setCluster])
}, [datasetId, scopeId, scopes, setScope, setUmap, setCluster])

const navigate = useNavigate();
const handleSaveScope = useCallback((event) => {
Expand Down Expand Up @@ -319,19 +334,18 @@ function DatasetSetup() {
<div>
Embedding on <b>{textColumn}</b>
</div>
{!embeddingsJob ?
<form onSubmit={handleNewEmbedding}>
<div>
<label htmlFor="modelName">Model:</label>
<select id="modelName" name="modelName">
<select id="modelName" name="modelName" disabled={!!embeddingsJob}>
{models.map((model, index) => (
<option key={index} value={model.id}>{model.provider}: {model.name}</option>
))}
</select>
</div>
<button type="submit">New Embedding</button>
</form> :
<JobProgress job={embeddingsJob} clearJob={()=> setEmbeddingsJob(null)} /> }
<button type="submit" disabled={!!embeddingsJob}>New Embedding</button>
</form>
<JobProgress job={embeddingsJob} clearJob={()=> setEmbeddingsJob(null)} />
<div className="dataset--setup-embeddings-list">
{embeddings.map((emb, index) => (
<div key={index}>
Expand All @@ -352,24 +366,23 @@ function DatasetSetup() {
<div className="dataset--setup-umaps">
<h3>2. UMAP </h3>
<div className="dataset--umaps-new">
{!umapJob ?
<form onSubmit={handleNewUmap}>
<label>
Neighbors:
<input type="number" name="neighbors" defaultValue="50"/>
<input type="number" name="neighbors" defaultValue="50" disabled={!!umapJob} />
</label>
<label>
Min Dist:
<input type="text" name="min_dist" defaultValue="0.1" />
<input type="text" name="min_dist" defaultValue="0.1" disabled={!!umapJob} />
</label>
<button type="submit">New UMAP</button>
<button type="submit" disabled={!!umapJob}>New UMAP</button>
</form>
: <JobProgress job={umapJob} clearJob={()=>setUmapJob(null)}/> }
<JobProgress job={umapJob} clearJob={()=>setUmapJob(null)}/>
</div>
<div className="dataset--setup-umaps-list">
{umaps.filter(d => d.embeddings == embedding).map((um, index) => (
<div className="dataset--setup-umaps-item" key={index}>
<input type="radio" id={`umap${index}`} name="umap" value={um} checked={um === umap} onChange={() => setUmap(um)} />
<input type="radio" id={`umap${index}`} name="umap" value={um} checked={um.name === umap?.name} onChange={() => setUmap(um)} />
<label htmlFor={`umap${index}`}>{um.name}
<br></br>
Neighbors: {um.neighbors}<br/>
Expand All @@ -386,18 +399,18 @@ function DatasetSetup() {
<div className="dataset--setup-clusters">
<h3>3. Clusters</h3>
<div className="dataset--clusters-new">
{!clusterJob ?
<form onSubmit={(e) => handleNewCluster(e, umap)}>
<label>
Samples:
<input type="number" name="samples" defaultValue="30"/>
<input type="number" name="samples" defaultValue="30" disabled={!!clusterJob}/>
</label><br/>
<label>
Min Samples:
<input type="number" name="min_samples" defaultValue="5" />
<input type="number" name="min_samples" defaultValue="5" disabled={!!clusterJob} />
</label>
<button type="submit">New Clusters</button>
</form> : <JobProgress job={clusterJob} clearJob={()=>setClusterJob(null)} /> }
<button type="submit" disabled={!!clusterJob}>New Clusters</button>
</form>
<JobProgress job={clusterJob} clearJob={()=>setClusterJob(null)} />
</div>
<div className="dataset--setup-clusters-list">
{umap && clusters.filter(d => d.umap_name == umap.name).map((cl, index) => (
Expand All @@ -406,7 +419,7 @@ function DatasetSetup() {
id={`cluster${index}`}
name="cluster"
value={cluster}
checked={cl === cluster}
checked={cl.cluster_name === cluster?.cluster_name}
onChange={() => setCluster(cl)} />
<label htmlFor={`cluster${index}`}>{cl.cluster_name}
<br></br>
Expand Down
22 changes: 13 additions & 9 deletions models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,18 @@
import json
from .providers.transformers import TransformersEmbedProvider, TransformersChatProvider
from .providers.openai import OpenAIEmbedProvider, OpenAIChatProvider
from .providers.mistralai import MistralAIEmbedProvider, MistralAIChatProvider
from .providers.cohereai import CohereAIEmbedProvider
from .providers.togetherai import TogetherAIEmbedProvider
from .providers.voyageai import VoyageAIEmbedProvider


embed_models_path = os.path.join(os.path.dirname(__file__), "embedding_models.json")
with open(embed_models_path, "r") as f:
embed_model_list = json.load(f)
embed_model_dict = {model['id']: model for model in embed_model_list}

def get_embedding_model(id):
"""Returns a ModelProvider instance for the given model id."""
embed_models_path = os.path.join(os.path.dirname(__file__), "embedding_models.json")
with open(embed_models_path, "r") as f:
embed_model_list = json.load(f)
embed_model_dict = {model['id']: model for model in embed_model_list}
model = embed_model_dict[id]
if not model:
raise ValueError(f"Model {id} not found")
Expand All @@ -22,22 +22,26 @@ def get_embedding_model(id):
return TransformersEmbedProvider(model['name'], model['params'])
if model['provider'] == "openai":
return OpenAIEmbedProvider(model['name'], model['params'])
if model['provider'] == "mistralai":
return MistralAIEmbedProvider(model['name'], model['params'])
if model['provider'] == "cohereai":
return CohereAIEmbedProvider(model['name'], model['params'])
if model['provider'] == "togetherai":
return TogetherAIEmbedProvider(model['name'], model['params'])
if model['provider'] == "voyageai":
return VoyageAIEmbedProvider(model['name'], model['params'])

chat_models_path = os.path.join(os.path.dirname(__file__), "chat_models.json")
with open(chat_models_path, "r") as f:
chat_model_list = json.load(f)
chat_model_dict = {model['id']: model for model in chat_model_list}

def get_chat_model(id):
"""Returns a ModelProvider instance for the given model id."""
chat_models_path = os.path.join(os.path.dirname(__file__), "chat_models.json")
with open(chat_models_path, "r") as f:
chat_model_list = json.load(f)
chat_model_dict = {model['id']: model for model in chat_model_list}
model = chat_model_dict[id]
if model['provider'] == "transformers":
return TransformersChatProvider(model['name'], model['params'])
if model['provider'] == "openai":
return OpenAIChatProvider(model['name'], model['params'])
if model['provider'] == "mistralai":
return MistralAIChatProvider(model['name'], model['params'])
103 changes: 68 additions & 35 deletions models/chat_models.json
Original file line number Diff line number Diff line change
@@ -1,48 +1,81 @@
[
{
{
"provider": "transformers",
"name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"sanitized_name": "TinyLlama___TinyLlama-1.1B-Chat-v1.0",
"id": "transformers-TinyLlama___TinyLlama-1.1B-Chat-v1.0",
"params": {
"max_tokens": 2048
}
},
{
"provider": "transformers",
"name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"sanitized_name": "TinyLlama___TinyLlama-1.1B-Chat-v1.0",
"id": "transformers-TinyLlama___TinyLlama-1.1B-Chat-v1.0",
"name": "HuggingFaceH4/zephyr-7b-beta",
"sanitized_name": "HuggingFaceH4___zephyr-7b-beta",
"id": "transformers-HuggingFaceH4___zephyr-7b-beta",
"params": {
"max_tokens": 2048
"max_tokens": 4096
}
},
{
"provider": "openai",
"name": "gpt-3.5-turbo",
"id": "openai-gpt-3.5-turbo",
"params": {
"max_tokens": 4096
}
},
{
"provider": "openai",
"name": "gpt-4-1106-preview",
"id": "openai-gpt-4-1106-preview",
},
{
"provider": "openai",
"name": "gpt-3.5-turbo",
"id": "openai-gpt-3.5-turbo",
"params": {
"max_tokens": 4096
}
},
{
"provider": "openai",
"name": "gpt-4-1106-preview",
"id": "openai-gpt-4-1106-preview",
"params": {
"max_tokens": 8192
}
},
{
"provider": "mistralai",
"name": "mistral-tiny",
"id": "mistralai-mistral-tiny",
"params": {
"max_tokens": 8192
}
},
{
"provider": "cohereai",
"name": "embed-english-v3.0",
"id": "cohereai-embed-english-v3.0",
},
{
"provider": "mistralai",
"name": "mistral-small",
"id": "mistralai-mistral-small",
"params": {
"input_type": "clustering"
"max_tokens": 8192
}
},
{
"provider": "togetherai",
"name": "togethercomputer/m2-bert-80M-2k-retrieval",
"sanitized_name": "togethercomputer___m2-bert-80M-2k-retrieval",
"id": "togetherai-togethercomputer___m2-bert-80M-2k-retrieval",
"modality": "text",
},
{
"provider": "mistralai",
"name": "mistral-medium",
"id": "mistralai-mistral-medium",
"params": {
"truncation": true,
"padding": true,
"max_tokens": 2048
"max_tokens": 8192
}
}
},
{
"provider": "cohereai",
"name": "embed-english-v3.0",
"id": "cohereai-embed-english-v3.0",
"params": {
"input_type": "clustering"
}
},
{
"provider": "togetherai",
"name": "togethercomputer/m2-bert-80M-2k-retrieval",
"sanitized_name": "togethercomputer___m2-bert-80M-2k-retrieval",
"id": "togetherai-togethercomputer___m2-bert-80M-2k-retrieval",
"modality": "text",
"params": {
"truncation": true,
"padding": true,
"max_tokens": 2048
}
}

]
7 changes: 7 additions & 0 deletions models/embedding_models.json
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,13 @@
"max_tokens": 8192
}
},
{
"provider": "mistralai",
"name": "mistral-embed",
"id": "mistralai-mistral-embed",
"params": {
}
},
{
"provider": "cohereai",
"name": "embed-english-v3.0",
Expand Down
Loading

0 comments on commit 35da197

Please sign in to comment.