Commit
fix issues with scripts in module. add scope command. support ingest parquet and jsonl.
enjalot committed Feb 12, 2024
1 parent a5faf2a commit bff5241
Showing 22 changed files with 210 additions and 76 deletions.
10 changes: 9 additions & 1 deletion DEVELOPMENT.md
@@ -41,7 +41,15 @@ TODO: flesh out these instructions
```
python setup.py sdist bdist_wheel
```
This builds the package, including the React app, and bundles it all up. This allows yo
This builds the package, including the React app, and bundles it all up.

```
deactivate
python setup.py sdist bdist_wheel
source testenv/bin/activate
pip install dist/latentscope-0.1.0-py3-none-any.whl
```
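
A quick way to sanity-check the freshly installed wheel (a sketch, assuming the `ls-serve` entry point described in the README is on the test environment's PATH) is to point it at a data directory and open the app in the browser:

```
ls-serve ~/latent-scope-data
```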


# Python Code
19 changes: 16 additions & 3 deletions README.md
@@ -4,29 +4,41 @@ Quickly embed, project, cluster and explore a dataset. I think of this project a

### Demo
This tool is meant to be run locally or on a trusted server to process data for viewing in the latent scope. You can see the result of the process in live demos:
* TODO: OpenOrca
* TODO: datavis survey responses
* TODO: Dolly 15k
* TODO: r/DadJokes
* TODO: emotion

TODO: YouTube getting started video

### Quick Start
To get started, install the [latent-scope module]() and run the server:

```bash
python -m venv venv
source venv/bin/activate
pip install latent-scope
ls-serve ~/latent-scope-data
```

Then open your browser to http://localhost:5001 and upload your first dataset!

You can also ingest data from a Pandas dataframe:
```python
from latentscope import ls
ls.init("~/latent-scope-data")
ls.ingest("dadabase", df, text_column="joke")
ls.serve()
```
See the notebooks linked below for detailed examples of using the Python interface.

### Notebooks
You can also configure and run the server from inside Python; see these notebooks for examples of preparing and loading data, or the short sketch after this list:
* [dvs-survey](notebooks/dvs-survey.ipynb) - a small test dataset of 700 rows to quickly illustrate the process
* [dadabase](notebooks/dadabase.ipynb) - a more interesting (and funny) dataset of 50k rows
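
If you want to stay in Python end to end, the flow mirrors the command line scripts in the next section. The sketch below is illustrative only: it assumes the helpers imported in `latentscope/__init__.py` (`umap`, `cluster`, `label`, `scope`, `serve`) are also exposed on `ls` and take the same positional arguments as their `ls-*` CLI counterparts, with `MODEL_ID` as a placeholder for your embedding model id:

```python
from latentscope import ls

ls.init("~/latent-scope-data")                 # choose the data directory
ls.ingest("dadabase", df, text_column="joke")  # df is a Pandas dataframe
ls.embed("dadabase", "joke", "MODEL_ID", prefix="", rerun=None)
ls.umap("dadabase", "embedding-001", 25, .1)   # embedding id, neighbors, min_dist
ls.cluster("dadabase", "umap-001", 5, 5)       # umap id, samples, min_samples
ls.label("dadabase", "joke", "cluster-001", "transformers-HuggingFaceH4___zephyr-7b-beta", None)
ls.scope("dadabase", "embedding-001", "umap-001", "cluster-001", "cluster-001-labels-001",
         "dad jokes", "my first scope")
ls.serve()
```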

### Command line scripts
When latent-scope is installed, it creates a suite of command line scripts
When latent-scope is installed, it creates a suite of command line scripts that can be used to set up scopes for exploration and to run the web application.

```bash
# like above, we make sure to install latent-scope
@@ -50,8 +62,9 @@
ls-umap datavis-misunderstood embedding-001 25 .1
ls-cluster datavis-misunderstood umap-001 5 5
# ls-label dataset_id text_column cluster_id model_id context
ls-label datavis-misunderstood "answer" cluster-001 transformers-HuggingFaceH4___zephyr-7b-beta
# ls-scope dataset_id labels_id name description
# ls-scope dataset_id embedding_id umap_id cluster_id cluster_labels_id label description
ls-scope datavis-misunderstood embedding-001 umap-001 cluster-001 cluster-001-labels-001 "E5 demo" "E5 embeddings summarized by Zephyr 7B"
# start the server to explore your scope
ls-serve
```

3 changes: 3 additions & 0 deletions latentscope/__init__.py
@@ -5,6 +5,9 @@
from .scripts.umapper import umapper as umap
from .scripts.cluster import clusterer as cluster
from .scripts.label_clusters import labeler as label
from .scripts.scope import scope

from .server import serve

from .util import update_data_dir, get_data_dir, set_openai_key, set_voyage_key, set_together_key, set_cohere_key, set_mistral_key

Empty file.
6 changes: 5 additions & 1 deletion latentscope/models/providers/cohereai.py
@@ -7,7 +7,11 @@

class CohereAIEmbedProvider(EmbedModelProvider):
def load_model(self):
self.client = cohere.Client(get_key("COHERE_API_KEY"))
api_key = get_key("COHERE_API_KEY")
if api_key is None:
print("ERROR: No API key found for Cohere")
print("Missing 'COHERE_API_KEY' variable in:", f"{os.getcwd()}/.env")
self.client = cohere.Client(api_key)

def embed(self, inputs):
time.sleep(0.01) # TODO proper rate limiting
12 changes: 10 additions & 2 deletions latentscope/models/providers/mistralai.py
@@ -19,7 +19,11 @@

class MistralAIEmbedProvider(EmbedModelProvider):
def load_model(self):
self.client = MistralClient(get_key("MISTRAL_API_KEY"))
api_key = get_key("MISTRAL_API_KEY")
if api_key is None:
print("ERROR: No API key found for Mistral")
print("Missing 'MISTRAL_API_KEY' variable in:", f"{os.getcwd()}/.env")
self.client = MistralClient(api_key=api_key)

def embed(self, inputs):
time.sleep(0.1) # TODO proper rate limiting
@@ -28,7 +32,11 @@ def embed(self, inputs):

class MistralAIChatProvider(ChatModelProvider):
def load_model(self):
self.client = MistralClient(api_key=get_key("MISTRAL_API_KEY"))
api_key = get_key("MISTRAL_API_KEY")
if api_key is None:
print("ERROR: No API key found for Mistral")
print("Missing 'MISTRAL_API_KEY' variable in:", f"{os.getcwd()}/.env")
self.client = MistralClient(api_key=api_key)
self.encoder = AutoTokenizer.from_pretrained(encoders[self.name])

def chat(self, messages):
6 changes: 5 additions & 1 deletion latentscope/models/providers/openai.py
@@ -8,7 +8,11 @@

class OpenAIEmbedProvider(EmbedModelProvider):
def load_model(self):
self.client = OpenAI(api_key=get_key("OPENAI_API_KEY"))
api_key = get_key("OPENAI_API_KEY")
if api_key is None:
print("ERROR: No API key found for OpenAI")
print("Missing 'OPENAI_API_KEY' variable in:", f"{os.getcwd()}/.env")
self.client = OpenAI(api_key=api_key)
# special case for the new embedding models
if self.name in ["text-embedding-3-small", "text-embedding-3-large"]:
self.encoder = tiktoken.get_encoding("cl100k_base")
6 changes: 5 additions & 1 deletion latentscope/models/providers/togetherai.py
@@ -8,7 +8,11 @@

class TogetherAIEmbedProvider(EmbedModelProvider):
def load_model(self):
together.api_key = get_key("TOGETHER_API_KEY")
api_key = get_key("TOGETHER_API_KEY")
if api_key is None:
print("ERROR: No API key found for Together")
print("Missing 'TOGETHER_API_KEY' variable in:", f"{os.getcwd()}/.env")
together.api_key = api_key
self.client = together.Together()
self.encoder = tiktoken.encoding_for_model("text-embedding-ada-002")

22 changes: 5 additions & 17 deletions latentscope/models/providers/voyageai.py
@@ -10,33 +10,21 @@

class VoyageAIEmbedProvider(EmbedModelProvider):
def load_model(self):
self.client = voyageai.Client(get_key("VOYAGE_API_KEY"))
api_key = get_key("VOYAGE_API_KEY")
if api_key is None:
print("ERROR: No API key found for Voyage")
print("Missing 'VOYAGE_API_KEY' variable in:", f"{os.getcwd()}/.env")
self.client = voyageai.Client(api_key)
# The voyage client provides a tokenizer that only encodes https://docs.voyageai.com/tokenization/
# It also says that it uses the same tokenizer as Llama 2
self.encoder = Tokenizer.from_pretrained("TheBloke/Llama-2-70B-fp16")

def embed(self, inputs):
time.sleep(0.1) # TODO proper rate limiting

# We truncate the input ourselves; even though the API supports truncation, it's still possible to send too big a batch
enc = self.encoder
max_tokens = self.params["max_tokens"]
# print("max tokens", max_tokens)
# print("before", self.client.count_tokens(inputs))
# total = 0
# inputs = [enc.decode(enc.encode(b)[:max_tokens]) if len(enc.encode(b)) > max_tokens else b for b in inputs]
inputs = [enc.decode(enc.encode(b).ids[:max_tokens]) if len(enc.encode(b)) > max_tokens else b for b in inputs]

# for i in inputs:
# total += len(enc.encode(i))
# print(len(enc.encode(i)), self.client.count_tokens([i]))
# print("after", self.client.count_tokens(inputs))
# print("llama 2 total", total)
# import json
# print("JSON")
# print(json.dumps(inputs, indent=2))
# print(" ")

response = self.client.embed(texts=inputs, model=self.name, truncation=self.params["truncation"])
embeddings = response.embeddings
return embeddings
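
All of the providers above look their keys up with `get_key`, which reads from a `.env` file in the working directory. As a hedged convenience sketch, the key setters imported in `latentscope/__init__.py` (`set_openai_key`, `set_voyage_key`, `set_cohere_key`, ...) can be used to store them from Python — this assumes each setter takes the key string as its only argument:

```python
import latentscope

# placeholder values; each set_*_key helper is assumed to persist the key
# where get_key("..._API_KEY") can later find it (e.g. the .env file)
latentscope.set_openai_key("sk-...")
latentscope.set_voyage_key("your-voyage-key")
latentscope.set_cohere_key("your-cohere-key")
```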
41 changes: 22 additions & 19 deletions latentscope/scripts/embed.py
@@ -39,9 +39,12 @@ def append_to_hdf5(file_path, new_data):
dataset = f.create_dataset(dataset_name, data=new_data, maxshape=maxshape, chunks=True)

def get_last_batch(file_path):
with h5py.File(file_path, 'r') as f:
dataset = f["embeddings"]
return dataset.shape[0]
try:
with h5py.File(file_path, 'r') as f:
dataset = f["embeddings"]
return dataset.shape[0]
except FileNotFoundError:
return 0


def main():
@@ -59,26 +62,14 @@ def main():
def embed(dataset_id, text_column, model_id, prefix, rerun):
DATA_DIR = get_data_dir()
df = pd.read_parquet(os.path.join(DATA_DIR, dataset_id, "input.parquet"))
sentences = df[text_column].tolist()
prefixed = []
if prefix is None:
prefix = ""
for i,s in enumerate(sentences):
if s is None:
print(i,s)
s = ""
prefixed.append(prefix + s)
sentences = prefixed #[prefix + s for s in sentences]

# determine the embedding id

embedding_dir = os.path.join(DATA_DIR, dataset_id, "embeddings")
if not os.path.exists(embedding_dir):
os.makedirs(embedding_dir)

batch_size = 100
# sentence_embeddings = []
total_batches = len(sentences)//batch_size

# determine the embedding id
if rerun is not None:
embedding_id = rerun
starting_batch = get_last_batch(os.path.join(embedding_dir, f"{embedding_id}.h5")) // batch_size
@@ -101,6 +92,20 @@ def embed(dataset_id, text_column, model_id, prefix, rerun):
print("loading", model.name)
model.load_model()

print("Checking for empty inputs")
sentences = df[text_column].tolist()
prefixed = []
if prefix is None:
prefix = ""
for i,s in enumerate(sentences):
if s is None or s == "":
print(i,s, "text is empty, adding a [space]")
s = " "
prefixed.append(prefix + s)
sentences = prefixed #[prefix + s for s in sentences]

total_batches = len(sentences)//batch_size

print("embedding", len(sentences), "sentences", "in", total_batches, "batches")
if starting_batch > 0:
print("Rerunning starting at batch", starting_batch)
@@ -164,8 +169,6 @@ def embed_debug(parquet_file, model_id, text_column):
model.load_model()

for i,row in enumerate(df.iterrows()):
if(i != 34):
continue
print("batch index:", i)
print("original index:", row[0])
text = row[1][text_column]
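
The `rerun` argument together with `get_last_batch` lets an interrupted embedding run resume from the last batch written to the HDF5 file. A minimal sketch, assuming `embed` is importable from `latentscope.scripts.embed` and using an illustrative `MODEL_ID` and embedding id:

```python
from latentscope.scripts.embed import embed

# first attempt; batches finished before a crash remain in
# <data dir>/dadabase/embeddings/embedding-001.h5
embed("dadabase", "joke", "MODEL_ID", prefix="", rerun=None)

# resume: pass the existing embedding id so get_last_batch() picks
# the starting batch instead of re-embedding everything
embed("dadabase", "joke", "MODEL_ID", prefix="", rerun="embedding-001")
```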
29 changes: 20 additions & 9 deletions latentscope/scripts/ingest.py
@@ -7,21 +7,32 @@
from latentscope.util import get_data_dir

# TODO make a parquet version of these
def csv():
def main():
parser = argparse.ArgumentParser(description='Ingest a dataset')
parser.add_argument('id', type=str, help='Dataset id (directory name in data folder)')
parser.add_argument('--path', type=str, help='Path to csv file, otherwise assumes input.csv in dataset directory')
parser.add_argument('--path', type=str, help='Path to csv or parquet file, otherwise assumes input.csv in dataset directory')
args = parser.parse_args()
ingest(args.id, args.path)
ingest_file(args.id, args.path)

def ingest_csv(dataset_id, csv_path):
def ingest_file(dataset_id, file_path):
DATA_DIR = get_data_dir()
directory = os.path.join(DATA_DIR, dataset_id)
if not csv_path:
csv_path = os.path.join(directory, "input.csv")
csv_file = os.path.join(csv_path)
print("reading", csv_file)
df = pd.read_csv(csv_file)
if not file_path:
file_path = os.path.join(directory, "input.csv")
file_type = file_path.split('.')[-1]
print(f"File type detected: {file_type}")
file = os.path.join(file_path)
print("reading", file)
if file_type == "csv":
df = pd.read_csv(file)
elif file_type == "parquet":
df = pd.read_parquet(file)
elif file_type == "jsonl":
with open(file, 'r') as f:
lines = f.readlines()
df = pd.DataFrame([json.loads(line) for line in lines])
else:
raise ValueError(f"Unsupported file type: {file_type}")
ingest(dataset_id, df)


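With `ingest_file` dispatching on the file extension, the `ls-ingest` command (the same one `jobs.py` below invokes) can now point at csv, parquet or jsonl inputs. A hedged sketch of the CLI usage, with paths as illustrative placeholders:

```bash
# default: reads input.csv from the dataset directory
ls-ingest dadabase
# or ingest a parquet / jsonl file explicitly
ls-ingest dadabase --path=/path/to/jokes.parquet
ls-ingest dadabase --path=/path/to/jokes.jsonl
```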
52 changes: 52 additions & 0 deletions latentscope/scripts/scope.py
@@ -0,0 +1,52 @@
import os
import re
import json
import argparse
from latentscope.util import get_data_dir


def main():
parser = argparse.ArgumentParser(description='Setup a scope')
parser.add_argument('dataset_id', type=str, help='Dataset id (directory name in data folder)')
parser.add_argument('embedding_id', type=str, help='Embedding id')
parser.add_argument('umap_id', type=str, help='UMAP id')
parser.add_argument('cluster_id', type=str, help='Cluster id')
parser.add_argument('cluster_labels_id', type=str, help='Cluster labels id')
parser.add_argument('label', type=str, help='Label for the scope')
parser.add_argument('description', type=str, help='Description of the scope')
# parse the CLI arguments and dispatch to scope()
args = parser.parse_args()
scope(args.dataset_id, args.embedding_id, args.umap_id, args.cluster_id, args.cluster_labels_id, args.label, args.description)

def scope(dataset_id, embedding_id, umap_id, cluster_id, cluster_labels_id, label, description):
DATA_DIR = get_data_dir()
print("DATA DIR", DATA_DIR)
directory = os.path.join(DATA_DIR, dataset_id, "scopes")
# create the scopes directory on first use so listdir below doesn't fail
if not os.path.exists(directory):
    os.makedirs(directory)

def get_next_scopes_number(dataset):
# figure out the latest scope number
scopes_files = [f for f in os.listdir(directory) if re.match(r"scopes-\d+\.json", f)]
if len(scopes_files) > 0:
last_scopes = sorted(scopes_files)[-1]
last_scopes_number = int(last_scopes.split("-")[1].split(".")[0])
next_scopes_number = last_scopes_number + 1
else:
next_scopes_number = 1
return next_scopes_number

next_scopes_number = get_next_scopes_number(dataset_id)
# make the scope id from the number, zero padded to 3 digits
id = f"scopes-{next_scopes_number:03d}"
print("RUNNING:", id)

scope = {
"id": id,
"embedding_id": embedding_id,
"umap_id": umap_id,
"cluster_id": cluster_id,
"cluster_labels_id": cluster_labels_id,
"label": label,
"description": description
}

file_path = os.path.join(directory, id + ".json")
with open(file_path, 'w') as f:
json.dump(scope, f, indent=2)
print("wrote scope", id)
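
For reference, the first scope written by `scope` above lands in `<data dir>/<dataset>/scopes/scopes-001.json` and simply records the ids passed in. A small sketch of calling it directly from Python, reusing the ids from the README example:

```python
from latentscope.scripts.scope import scope

scope("datavis-misunderstood", "embedding-001", "umap-001", "cluster-001",
      "cluster-001-labels-001", "E5 demo", "E5 embeddings summarized by Zephyr 7B")
# -> writes scopes-001.json with id, embedding_id, umap_id, cluster_id,
#    cluster_labels_id, label and description
```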
6 changes: 5 additions & 1 deletion latentscope/server/__init__.py
@@ -12,4 +12,8 @@ def start():
# This sets the environment variable expected
update_data_dir(args.data_dir)
from .app import serve
serve(args.host, args.port, args.debug)
serve(args.host, args.port, args.debug)

def serve(host="0.0.0.0", port=5001, debug=False):
from .app import serve
serve(host, port, debug)
6 changes: 4 additions & 2 deletions latentscope/server/app.py
@@ -20,10 +20,12 @@
os.makedirs(DATA_DIR)

# We enable a read only mode of the server
def str_to_bool(s):
def check_read_only(s):
if s is None:
return False
return s.lower() in ['true', '1', 't', 'y', 'yes']
# export LATENT_SCOPE_READ_ONLY=1
READ_ONLY = str_to_bool(os.getenv("LATENT_SCOPE_READ_ONLY"))
READ_ONLY = check_read_only(os.getenv("LATENT_SCOPE_READ_ONLY"))
print("READ ONLY?", READ_ONLY, not READ_ONLY)

# in memory cache of dataframes loaded for each dataset
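
Since `check_read_only` just parses the `LATENT_SCOPE_READ_ONLY` environment variable, a read-only instance can be started like this (a sketch; `ls-serve` and the data directory are taken from the README):

```bash
export LATENT_SCOPE_READ_ONLY=1   # any of: true, 1, t, y, yes
ls-serve ~/latent-scope-data
```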
5 changes: 3 additions & 2 deletions latentscope/server/jobs.py
@@ -104,10 +104,11 @@ def run_ingest():
dataset_dir = os.path.join(DATA_DIR, dataset)
if not os.path.exists(dataset_dir):
os.makedirs(dataset_dir)
file.save(os.path.join(dataset_dir, "input.csv"))
file_path = os.path.join(dataset_dir, file.filename)
file.save(file_path)

job_id = str(uuid.uuid4())
command = f'ls-ingest-csv {dataset}'
command = f'ls-ingest {dataset} --path={file_path}'
threading.Thread(target=run_job, args=(dataset, job_id, command)).start()
return jsonify({"job_id": job_id})

Expand Down