Add chunker, improve modularity
lumpenspace committed Oct 21, 2023
1 parent 2ef341a commit 64cd3d0
Showing 4 changed files with 146 additions and 17 deletions.
51 changes: 51 additions & 0 deletions src/chunker.py
@@ -0,0 +1,51 @@
import json
from typing import List, Dict
from openai import GPT3Encoder

MAX_EMBEDDING_LENGTH = 2048

def split_into_chunks(blog_posts: List[Dict], encoder: GPT3Encoder):
    for post in blog_posts:
        # Split the post content into chunks
        content_tokens = encoder.encode(post['content'])

        chunks = []
        chunk_start = 0
        while chunk_start < len(content_tokens):
            chunk_end = min(chunk_start + MAX_EMBEDDING_LENGTH, len(content_tokens))
            # Find the closest newline before the length limit
            while chunk_end > chunk_start and content_tokens[chunk_end] != encoder.encode('\n')[0]:
                chunk_end -= 1
            chunks.append(content_tokens[chunk_start:chunk_end])
            chunk_start = chunk_end

        # Add title, date and part to each chunk and store its embedding in the Chroma DB
        for j, chunk in enumerate(chunks):
            document = encoder.decode(chunk)
            metadata = {
                "title": post['title'],
                "url": post['url'],
                "part": j+1,
                "total_parts": len(chunks),
                "date": post['date']
            }
            yield document, metadata

def main(name):
    encoder = GPT3Encoder()
    sourcefile = f'{name}.json'
    outputfile = f'{name}_chunked.jsonl'

    with open(sourcefile, 'r') as f:
        blog_posts = json.load(f)

    with open(outputfile, 'w') as f:
        for document, metadata in split_into_chunks(blog_posts, encoder):
            f.write(json.dumps({"document": document, "metadata": metadata}) + '\n')

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description='Split blog posts into chunks.')
    parser.add_argument('name', help='The name of the blog to process.')
    args = parser.parse_args()
    main(args.name)
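One caveat with the boundary search above: when chunk_end lands at len(content_tokens), content_tokens[chunk_end] indexes one token past the end, and a window that contains no newline token walks chunk_end all the way back to chunk_start, leaving an empty chunk and a loop that no longer advances. A minimal sketch of a guarded version of the same idea, written against a plain token list so it assumes no particular encoder (newline_token stands in for encoder.encode('\n')[0]):

from typing import List

MAX_EMBEDDING_LENGTH = 2048

def split_tokens_at_newlines(tokens: List[int], newline_token: int,
                             max_len: int = MAX_EMBEDDING_LENGTH) -> List[List[int]]:
    """Split tokens into chunks of at most max_len, preferring newline boundaries."""
    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + max_len, len(tokens))
        if end < len(tokens):
            # Walk back to just after the nearest newline, but never past the chunk start.
            boundary = end
            while boundary > start and tokens[boundary - 1] != newline_token:
                boundary -= 1
            # If the window holds no newline, keep the hard cut so the loop always advances.
            if boundary > start:
                end = boundary
        chunks.append(tokens[start:end])
        start = end
    return chunks

Feeding it encoder.encode(post['content']) would produce the same kind of chunk lists that split_into_chunks then decodes and pairs with metadata.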
28 changes: 28 additions & 0 deletions src/embeddings_helpers.py
@@ -0,0 +1,28 @@
# helper_functions.py
import json
from openai import Embedding
from chromadb import Client

def get_embedding(text:str):
    return Embedding.create(text=text)

def store_grounding_embeddings(name: str):
    # Initialize the Chroma client and get the collection
    chroma_client = Client()
    collection = chroma_client.get_collection(name)

    # Determine the source file name based on the name parameter
    sourcefile = f'{name}_chunked.jsonl'

    # Read the chunks and their metadata from the source file
    with open(sourcefile, 'r') as f:
        for line in f:
            chunk = json.loads(line)
            document = chunk['document']
            metadata = chunk['metadata']

            # Get the OpenAI embedding for the document
            embedding = get_embedding(document)

            # Store the document, its embedding, and its metadata in Chroma
            collection.add(ids=[f"{metadata['title']}_part_{metadata['part']}"], embeddings=[embedding], documents=[document], metadatas=[metadata])
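For reference, the pre-1.0 openai client documented Embedding.create as taking input= and model= rather than text=, with the vector returned under data[0].embedding. A sketch of get_embedding written against that legacy interface; the model name here is an assumption, since the commit does not pin one:

import openai

EMBEDDING_MODEL = "text-embedding-ada-002"  # assumed model; not specified in the commit

def get_embedding(text: str) -> list:
    # Legacy (pre-1.0) openai client: response is {"data": [{"embedding": [...]}], ...}
    response = openai.Embedding.create(input=[text], model=EMBEDDING_MODEL)
    return response["data"][0]["embedding"]

On the Chroma side, Client().get_collection(name) raises if the collection has not been created yet; get_or_create_collection(name) is the more forgiving choice when the pipeline runs against a fresh database.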
29 changes: 12 additions & 17 deletions src/memories.py
@@ -1,7 +1,7 @@
 from typing import List, Tuple, Dict
 from concurrent.futures import ThreadPoolExecutor
-from openai import ChatCompletion, Embedding, GPT3Encoder
-import logging
+from openai import Embedding, GPT3Encoder
+import json
 import chromadb
 from ..prompts import summarize_memory

@@ -35,21 +35,16 @@ def summarize_helpful_memories(self, question:str, similar_extracts:List[Tuple[s
         summaries = list(executor.map(summarize_memory, similar_extracts, [question]*len(similar_extracts)))
         summaries = [summary for summary in summaries if summary != "skip"]
         return summaries

-    def store_grounding_embeddings(self, blog_posts: List[Dict]):
-        # Iterate over the blog posts
-        for post in blog_posts:
-            # Reserve some tokens for the title and date
-            reserved_tokens = self.encoder.encode(f"Title: {post['title']}\nDate: {post['date']}\nPart: 1\n")
-            chunk_size = MAX_EMBEDDING_LENGTH - len(reserved_tokens)
-
-            # Split the post content into chunks
-            content_tokens = self.encoder.encode(post['content'])
-            chunks = [content_tokens[i:i+chunk_size] for i in range(0, len(content_tokens), chunk_size)]
-
-            logging.info(f"Storing {len(chunks)} chunks for post {post['title']}")
-
-            # Add title, date and part to each chunk and store its embedding in the Chroma DB
-            for j, chunk in enumerate(chunks):
-                document = f"Title: {post['title']}\nDate: {post['date']}\nPart: {j+1}\n{self.encoder.decode(chunk)}"
-                self.collection.add(documents=[document])
+    def store_grounding_embeddings(self, name: str):
+        sourcefile = f'{name}_chunked.jsonl'
+
+        with open(sourcefile, 'r') as f:
+            for line in f:
+                chunk = json.loads(line)
+                document = chunk['document']
+                metadata = chunk['metadata']
+
+                embedding = self.get_embedding(document)
+
+                self.collection.add(ids=[f"{metadata['title']}_part_{metadata['part']}"], embeddings=[embedding], documents=[document], metadatas=[metadata])
55 changes: 55 additions & 0 deletions src/ssjl.py
@@ -0,0 +1,55 @@
import os
import json
import requests
import argparse
from bs4 import BeautifulSoup
from time import sleep
from random import randrange

def fetch_json(url, params):
    endpoint = "%s/api/v1/archive" % url
    response = requests.get(endpoint, params=params)
    response.raise_for_status()
    return response.json()

def fetch_html(url):
    response = requests.get(url)
    return response.text

def fetch_and_parse(url):
    limit = 12
    offset = 0
    results_len = 1
    while results_len != 0:
        params = {'limit': limit, 'offset': offset}
        entries = fetch_json(url, params=params)
        for item in entries:
            Link = item['canonical_url']
            Title = item['title']
            Date = item['post_date']
            Html = fetch_html(Link)
            soup = BeautifulSoup(Html, 'html.parser')
            content = soup.find('div', {'class': 'markup'})
            if content:
                yield {
                    'title': Title,
                    'link': Link,
                    'date': Date,
                    'content': content.text,
                }
            timeout = randrange(2, 20)
            sleep(timeout)
        offset = limit + offset
        results_len = len(entries)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Fetch posts from a Substack blog.')
    parser.add_argument('blog', help='The name of the Substack blog to fetch posts from.')
    args = parser.parse_args()

    url = f'https://{args.blog}.substack.com'
    filename = 'data/' + url.replace('https://', '').replace('.', '-') + '.jsonl'
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w') as f:
        for post in fetch_and_parse(url):
            f.write(json.dumps(post) + '\n')
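Note the handoff between the scripts: ssjl.py writes one JSON object per line to data/<blog>-substack-com.jsonl and labels the URL field link, while chunker.py reads <name>.json as a single JSON array and looks up post['url']. A hypothetical adapter (jsonl_to_json_array is not part of this commit) bridging the two formats might look like:

import json

def jsonl_to_json_array(jsonl_path: str, json_path: str) -> None:
    # Hypothetical glue step: collect scraped JSONL posts into the JSON array the
    # chunker reads, renaming 'link' to 'url' so the chunk metadata can be built.
    posts = []
    with open(jsonl_path, 'r') as f:
        for line in f:
            post = json.loads(line)
            if 'link' in post:
                post['url'] = post.pop('link')
            posts.append(post)
    with open(json_path, 'w') as f:
        json.dump(posts, f)

For a blog named myblog, something like jsonl_to_json_array('data/myblog-substack-com.jsonl', 'myblog.json') would prepare the input that chunker.py's main('myblog') expects.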
