Commit
fix issues with scripts in module. add scope command. support ingest parquet and jsonl.
enjalot committed Feb 12, 2024
1 parent a5faf2a commit bff5241
Showing 22 changed files with 210 additions and 76 deletions.
10 changes: 9 additions & 1 deletion DEVELOPMENT.md
@@ -41,7 +41,15 @@ TODO: flesh out these instructions
```
python setup.py sdist bdist_wheel
```
This builds the package, including the React app, and bundles it all up. This allows yo
This builds the package, including the React app, and bundles it all up.

```
deactivate
python setup.py sdist bdist_wheel
source testenv/bin/activate
pip install dist/latentscope-0.1.0-py3-none-any.whl
```
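
A quick way to sanity-check the freshly installed wheel (a sketch, assuming the `ls-serve` entry point described in the README is on the test environment's PATH) is to point it at a data directory and open the app in the browser:

```
ls-serve ~/latent-scope-data
```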


# Python Code
19 changes: 16 additions & 3 deletions README.md
@@ -4,29 +4,41 @@ Quickly embed, project, cluster and explore a dataset. I think of this project a

### Demo
This tool is meant to be run locally or on a trusted server to process data for viewing in the latent scope. You can see the result of the process in live demos:
* TODO: OpenOrca
* TODO: datavis survey responses
* TODO: Dolly 15k
* TODO: r/DadJokes
* TODO: emotion

TODO: YouTube getting started video

### Quick Start
To get started, install the [latent-scope module]() and run the server:

```bash
python -m venv venv
source venv/bin/activate
pip install latent-scope
ls-serve ~/latent-scope-data
```

Then open your browser to http://localhost:5001 and upload your first dataset!

You can also ingest data from a Pandas dataframe:
```python
from latentscope import ls
ls.init("~/latent-scope-data")
ls.ingest("dadabase", df, text_column="joke")
ls.serve()
```
See the notebooks linked below for detailed examples of using the Python interface.

### Notebooks
You can also configure and run the server from inside Python; see these notebooks for examples of preparing and loading data, or the short sketch after this list:
* [dvs-survey](notebooks/dvs-survey.ipynb) - a small test dataset of 700 rows to quickly illustrate the process
* [dadabase](notebooks/dadabase.ipynb) - a more interesting (and funny) dataset of 50k rows
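
If you want to stay in Python end to end, the flow mirrors the command line scripts in the next section. The sketch below is illustrative only: it assumes the helpers imported in `latentscope/__init__.py` (`umap`, `cluster`, `label`, `scope`, `serve`) are also exposed on `ls` and take the same positional arguments as their `ls-*` CLI counterparts, with `MODEL_ID` as a placeholder for your embedding model id:

```python
from latentscope import ls

ls.init("~/latent-scope-data")                 # choose the data directory
ls.ingest("dadabase", df, text_column="joke")  # df is a Pandas dataframe
ls.embed("dadabase", "joke", "MODEL_ID", prefix="", rerun=None)
ls.umap("dadabase", "embedding-001", 25, .1)   # embedding id, neighbors, min_dist
ls.cluster("dadabase", "umap-001", 5, 5)       # umap id, samples, min_samples
ls.label("dadabase", "joke", "cluster-001", "transformers-HuggingFaceH4___zephyr-7b-beta", None)
ls.scope("dadabase", "embedding-001", "umap-001", "cluster-001", "cluster-001-labels-001",
         "dad jokes", "my first scope")
ls.serve()
```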

### Command line scripts
When latent-scope is installed, it creates a suite of command line scripts
When latent-scope is installed, it creates a suite of command line scripts that can be used to set up scopes for exploration and to run the web application.

```bash
# like above, we make sure to install latent-scope
@@ -50,8 +62,9 @@
ls-umap datavis-misunderstood embedding-001 25 .1
ls-cluster datavis-misunderstood umap-001 5 5
# ls-label dataset_id text_column cluster_id model_id context
ls-label datavis-misunderstood "answer" cluster-001 transformers-HuggingFaceH4___zephyr-7b-beta
# ls-scope dataset_id labels_id name description
# ls-scope dataset_id embedding_id umap_id cluster_id cluster_labels_id label description
ls-scope datavis-misunderstood embedding-001 umap-001 cluster-001 cluster-001-labels-001 "E5 demo" "E5 embeddings summarized by Zephyr 7B"
# start the server to explore your scope
ls-serve
```

3 changes: 3 additions & 0 deletions latentscope/__init__.py
@@ -5,6 +5,9 @@
from .scripts.umapper import umapper as umap
from .scripts.cluster import clusterer as cluster
from .scripts.label_clusters import labeler as label
from .scripts.scope import scope

from .server import serve

from .util import update_data_dir, get_data_dir, set_openai_key, set_voyage_key, set_together_key, set_cohere_key, set_mistral_key

Empty file.
6 changes: 5 additions & 1 deletion latentscope/models/providers/cohereai.py
@@ -7,7 +7,11 @@

class CohereAIEmbedProvider(EmbedModelProvider):
def load_model(self):
self.client = cohere.Client(get_key("COHERE_API_KEY"))
api_key = get_key("COHERE_API_KEY")
if api_key is None:
print("ERROR: No API key found for Cohere")
print("Missing 'COHERE_API_KEY' variable in:", f"{os.getcwd()}/.env")
self.client = cohere.Client(api_key)

def embed(self, inputs):
time.sleep(0.01) # TODO proper rate limiting
12 changes: 10 additions & 2 deletions latentscope/models/providers/mistralai.py
@@ -19,7 +19,11 @@

class MistralAIEmbedProvider(EmbedModelProvider):
def load_model(self):
self.client = MistralClient(get_key("MISTRAL_API_KEY"))
api_key = get_key("MISTRAL_API_KEY")
if api_key is None:
print("ERROR: No API key found for Mistral")
print("Missing 'MISTRAL_API_KEY' variable in:", f"{os.getcwd()}/.env")
self.client = MistralClient(api_key=api_key)

def embed(self, inputs):
time.sleep(0.1) # TODO proper rate limiting
@@ -28,7 +32,11 @@ def embed(self, inputs):

class MistralAIChatProvider(ChatModelProvider):
def load_model(self):
self.client = MistralClient(api_key=get_key("MISTRAL_API_KEY"))
api_key = get_key("MISTRAL_API_KEY")
if api_key is None:
print("ERROR: No API key found for Mistral")
print("Missing 'MISTRAL_API_KEY' variable in:", f"{os.getcwd()}/.env")
self.client = MistralClient(api_key=api_key)
self.encoder = AutoTokenizer.from_pretrained(encoders[self.name])

def chat(self, messages):
6 changes: 5 additions & 1 deletion latentscope/models/providers/openai.py
@@ -8,7 +8,11 @@

class OpenAIEmbedProvider(EmbedModelProvider):
def load_model(self):
self.client = OpenAI(api_key=get_key("OPENAI_API_KEY"))
api_key = get_key("OPENAI_API_KEY")
if api_key is None:
print("ERROR: No API key found for OpenAI")
print("Missing 'OPENAI_API_KEY' variable in:", f"{os.getcwd()}/.env")
self.client = OpenAI(api_key=api_key)
# special case for the new embedding models
if self.name in ["text-embedding-3-small", "text-embedding-3-large"]:
self.encoder = tiktoken.get_encoding("cl100k_base")
6 changes: 5 additions & 1 deletion latentscope/models/providers/togetherai.py
@@ -8,7 +8,11 @@

class TogetherAIEmbedProvider(EmbedModelProvider):
def load_model(self):
together.api_key = get_key("TOGETHER_API_KEY")
api_key = get_key("TOGETHER_API_KEY")
if api_key is None:
print("ERROR: No API key found for Together")
print("Missing 'TOGETHER_API_KEY' variable in:", f"{os.getcwd()}/.env")
together.api_key = api_key
self.client = together.Together()
self.encoder = tiktoken.encoding_for_model("text-embedding-ada-002")

22 changes: 5 additions & 17 deletions latentscope/models/providers/voyageai.py
@@ -10,33 +10,21 @@

class VoyageAIEmbedProvider(EmbedModelProvider):
def load_model(self):
self.client = voyageai.Client(get_key("VOYAGE_API_KEY"))
api_key = get_key("VOYAGE_API_KEY")
if api_key is None:
print("ERROR: No API key found for Voyage")
print("Missing 'VOYAGE_API_KEY' variable in:", f"{os.getcwd()}/.env")
self.client = voyageai.Client(api_key)
# The voyage client provides a tokenizer that only encodes https://docs.voyageai.com/tokenization/
# It also says that it uses the same tokenizer as Llama 2
self.encoder = Tokenizer.from_pretrained("TheBloke/Llama-2-70B-fp16")

def embed(self, inputs):
time.sleep(0.1) # TODO proper rate limiting

# We truncate the input ourselves; even though the API supports truncation, it's still possible to send too big a batch
enc = self.encoder
max_tokens = self.params["max_tokens"]
# print("max tokens", max_tokens)
# print("before", self.client.count_tokens(inputs))
# total = 0
# inputs = [enc.decode(enc.encode(b)[:max_tokens]) if len(enc.encode(b)) > max_tokens else b for b in inputs]
inputs = [enc.decode(enc.encode(b).ids[:max_tokens]) if len(enc.encode(b)) > max_tokens else b for b in inputs]

# for i in inputs:
# total += len(enc.encode(i))
# print(len(enc.encode(i)), self.client.count_tokens([i]))
# print("after", self.client.count_tokens(inputs))
# print("llama 2 total", total)
# import json
# print("JSON")
# print(json.dumps(inputs, indent=2))
# print(" ")

response = self.client.embed(texts=inputs, model=self.name, truncation=self.params["truncation"])
embeddings = response.embeddings
return embeddings
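
All of the providers above look their keys up with `get_key`, which reads from a `.env` file in the working directory. As a hedged convenience sketch, the key setters imported in `latentscope/__init__.py` (`set_openai_key`, `set_voyage_key`, `set_cohere_key`, ...) can be used to store them from Python — this assumes each setter takes the key string as its only argument:

```python
import latentscope

# placeholder values; each set_*_key helper is assumed to persist the key
# where get_key("..._API_KEY") can later find it (e.g. the .env file)
latentscope.set_openai_key("sk-...")
latentscope.set_voyage_key("your-voyage-key")
latentscope.set_cohere_key("your-cohere-key")
```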
41 changes: 22 additions & 19 deletions latentscope/scripts/embed.py
@@ -39,9 +39,12 @@ def append_to_hdf5(file_path, new_data):
dataset = f.create_dataset(dataset_name, data=new_data, maxshape=maxshape, chunks=True)

def get_last_batch(file_path):
with h5py.File(file_path, 'r') as f:
dataset = f["embeddings"]
return dataset.shape[0]
try:
with h5py.File(file_path, 'r') as f:
dataset = f["embeddings"]
return dataset.shape[0]
except FileNotFoundError:
return 0


def main():
@@ -59,26 +62,14 @@ def main():
def embed(dataset_id, text_column, model_id, prefix, rerun):
DATA_DIR = get_data_dir()
df = pd.read_parquet(os.path.join(DATA_DIR, dataset_id, "input.parquet"))
sentences = df[text_column].tolist()
prefixed = []
if prefix is None:
prefix = ""
for i,s in enumerate(sentences):
if s is None:
print(i,s)
s = ""
prefixed.append(prefix + s)
sentences = prefixed #[prefix + s for s in sentences]

# determine the embedding id

embedding_dir = os.path.join(DATA_DIR, dataset_id, "embeddings")
if not os.path.exists(embedding_dir):
os.makedirs(embedding_dir)

batch_size = 100
# sentence_embeddings = []
total_batches = len(sentences)//batch_size

# determine the embedding id
if rerun is not None:
embedding_id = rerun
starting_batch = get_last_batch(os.path.join(embedding_dir, f"{embedding_id}.h5")) // batch_size
@@ -101,6 +92,20 @@ def embed(dataset_id, text_column, model_id, prefix, rerun):
print("loading", model.name)
model.load_model()

print("Checking for empty inputs")
sentences = df[text_column].tolist()
prefixed = []
if prefix is None:
prefix = ""
for i,s in enumerate(sentences):
if s is None or s == "":
print(i,s, "text is empty, adding a [space]")
s = " "
prefixed.append(prefix + s)
sentences = prefixed #[prefix + s for s in sentences]

total_batches = len(sentences)//batch_size

print("embedding", len(sentences), "sentences", "in", total_batches, "batches")
if starting_batch > 0:
print("Rerunning starting at batch", starting_batch)
@@ -164,8 +169,6 @@ def embed_debug(parquet_file, model_id, text_column):
model.load_model()

for i,row in enumerate(df.iterrows()):
if(i != 34):
continue
print("batch index:", i)
print("original index:", row[0])
text = row[1][text_column]
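
The `rerun` argument together with `get_last_batch` lets an interrupted embedding run resume from the last batch written to the HDF5 file. A minimal sketch, assuming `embed` is importable from `latentscope.scripts.embed` and using an illustrative `MODEL_ID` and embedding id:

```python
from latentscope.scripts.embed import embed

# first attempt; batches finished before a crash remain in
# <data dir>/dadabase/embeddings/embedding-001.h5
embed("dadabase", "joke", "MODEL_ID", prefix="", rerun=None)

# resume: pass the existing embedding id so get_last_batch() picks
# the starting batch instead of re-embedding everything
embed("dadabase", "joke", "MODEL_ID", prefix="", rerun="embedding-001")
```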
29 changes: 20 additions & 9 deletions latentscope/scripts/ingest.py
@@ -7,21 +7,32 @@
from latentscope.util import get_data_dir

# TODO make a parquet version of these
def csv():
def main():
parser = argparse.ArgumentParser(description='Ingest a dataset')
parser.add_argument('id', type=str, help='Dataset id (directory name in data folder)')
parser.add_argument('--path', type=str, help='Path to csv file, otherwise assumes input.csv in dataset directory')
parser.add_argument('--path', type=str, help='Path to csv or parquet file, otherwise assumes input.csv in dataset directory')
args = parser.parse_args()
ingest(args.id, args.path)
ingest_file(args.id, args.path)

def ingest_csv(dataset_id, csv_path):
def ingest_file(dataset_id, file_path):
DATA_DIR = get_data_dir()
directory = os.path.join(DATA_DIR, dataset_id)
if not csv_path:
csv_path = os.path.join(directory, "input.csv")
csv_file = os.path.join(csv_path)
print("reading", csv_file)
df = pd.read_csv(csv_file)
if not file_path:
file_path = os.path.join(directory, "input.csv")
file_type = file_path.split('.')[-1]
print(f"File type detected: {file_type}")
file = os.path.join(file_path)
print("reading", file)
if file_type == "csv":
df = pd.read_csv(file)
elif file_type == "parquet":
df = pd.read_parquet(file)
elif file_type == "jsonl":
with open(file, 'r') as f:
lines = f.readlines()
df = pd.DataFrame([json.loads(line) for line in lines])
else:
raise ValueError(f"Unsupported file type: {file_type}")
ingest(dataset_id, df)


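With `ingest_file` dispatching on the file extension, the `ls-ingest` command (the same one `jobs.py` below invokes) can now point at csv, parquet or jsonl inputs. A hedged sketch of the CLI usage, with paths as illustrative placeholders:

```bash
# default: reads input.csv from the dataset directory
ls-ingest dadabase
# or ingest a parquet / jsonl file explicitly
ls-ingest dadabase --path=/path/to/jokes.parquet
ls-ingest dadabase --path=/path/to/jokes.jsonl
```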
52 changes: 52 additions & 0 deletions latentscope/scripts/scope.py
@@ -0,0 +1,52 @@
import os
import re
import json
import argparse
from latentscope.util import get_data_dir


def main():
parser = argparse.ArgumentParser(description='Setup a scope')
parser.add_argument('dataset_id', type=str, help='Dataset id (directory name in data folder)')
parser.add_argument('embedding_id', type=str, help='Embedding id')
parser.add_argument('umap_id', type=str, help='UMAP id')
parser.add_argument('cluster_id', type=str, help='Cluster id')
parser.add_argument('cluster_labels_id', type=str, help='Cluster labels id')
parser.add_argument('label', type=str, help='Label for the scope')
parser.add_argument('description', type=str, help='Description of the scope')
# parse the CLI arguments and dispatch to scope()
args = parser.parse_args()
scope(args.dataset_id, args.embedding_id, args.umap_id, args.cluster_id, args.cluster_labels_id, args.label, args.description)

def scope(dataset_id, embedding_id, umap_id, cluster_id, cluster_labels_id, label, description):
DATA_DIR = get_data_dir()
print("DATA DIR", DATA_DIR)
directory = os.path.join(DATA_DIR, dataset_id, "scopes")
# create the scopes directory on first use so listdir below doesn't fail
if not os.path.exists(directory):
    os.makedirs(directory)

def get_next_scopes_number(dataset):
# figure out the latest scope number
scopes_files = [f for f in os.listdir(directory) if re.match(r"scopes-\d+\.json", f)]
if len(scopes_files) > 0:
last_scopes = sorted(scopes_files)[-1]
last_scopes_number = int(last_scopes.split("-")[1].split(".")[0])
next_scopes_number = last_scopes_number + 1
else:
next_scopes_number = 1
return next_scopes_number

next_scopes_number = get_next_scopes_number(dataset_id)
# make the scope id from the number, zero padded to 3 digits
id = f"scopes-{next_scopes_number:03d}"
print("RUNNING:", id)

scope = {
"id": id,
"embedding_id": embedding_id,
"umap_id": umap_id,
"cluster_id": cluster_id,
"cluster_labels_id": cluster_labels_id,
"label": label,
"description": description
}

file_path = os.path.join(directory, id + ".json")
with open(file_path, 'w') as f:
json.dump(scope, f, indent=2)
print("wrote scope", id)
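
For reference, the first scope written by `scope` above lands in `<data dir>/<dataset>/scopes/scopes-001.json` and simply records the ids passed in. A small sketch of calling it directly from Python, reusing the ids from the README example:

```python
from latentscope.scripts.scope import scope

scope("datavis-misunderstood", "embedding-001", "umap-001", "cluster-001",
      "cluster-001-labels-001", "E5 demo", "E5 embeddings summarized by Zephyr 7B")
# -> writes scopes-001.json with id, embedding_id, umap_id, cluster_id,
#    cluster_labels_id, label and description
```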
6 changes: 5 additions & 1 deletion latentscope/server/__init__.py
@@ -12,4 +12,8 @@ def start():
# This sets the environment variable expected
update_data_dir(args.data_dir)
from .app import serve
serve(args.host, args.port, args.debug)
serve(args.host, args.port, args.debug)

def serve(host="0.0.0.0", port=5001, debug=False):
from .app import serve
serve(host, port, debug)
6 changes: 4 additions & 2 deletions latentscope/server/app.py
@@ -20,10 +20,12 @@
os.makedirs(DATA_DIR)

# We enable a read only mode of the server
def str_to_bool(s):
def check_read_only(s):
if s is None:
return False
return s.lower() in ['true', '1', 't', 'y', 'yes']
# export LATENT_SCOPE_READ_ONLY=1
READ_ONLY = str_to_bool(os.getenv("LATENT_SCOPE_READ_ONLY"))
READ_ONLY = check_read_only(os.getenv("LATENT_SCOPE_READ_ONLY"))
print("READ ONLY?", READ_ONLY, not READ_ONLY)

# in memory cache of dataframes loaded for each dataset
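
Since `check_read_only` just parses the `LATENT_SCOPE_READ_ONLY` environment variable, a read-only instance can be started like this (a sketch; `ls-serve` and the data directory are taken from the README):

```bash
export LATENT_SCOPE_READ_ONLY=1   # any of: true, 1, t, y, yes
ls-serve ~/latent-scope-data
```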
5 changes: 3 additions & 2 deletions latentscope/server/jobs.py
@@ -104,10 +104,11 @@ def run_ingest():
dataset_dir = os.path.join(DATA_DIR, dataset)
if not os.path.exists(dataset_dir):
os.makedirs(dataset_dir)
file.save(os.path.join(dataset_dir, "input.csv"))
file_path = os.path.join(dataset_dir, file.filename)
file.save(file_path)

job_id = str(uuid.uuid4())
command = f'ls-ingest-csv {dataset}'
command = f'ls-ingest {dataset} --path={file_path}'
threading.Thread(target=run_job, args=(dataset, job_id, command)).start()
return jsonify({"job_id": job_id})

Expand Down