Add chunker, improve modularity
lumpenspace committed Oct 21, 2023
1 parent 2ef341a commit 64cd3d0
Showing 4 changed files with 146 additions and 17 deletions.
51 changes: 51 additions & 0 deletions src/chunker.py
@@ -0,0 +1,51 @@
import json
from typing import List, Dict
from openai import GPT3Encoder

MAX_EMBEDDING_LENGTH = 2048

def split_into_chunks(blog_posts: List[Dict], encoder: GPT3Encoder):
    for post in blog_posts:
        # Split the post content into chunks
        content_tokens = encoder.encode(post['content'])

        chunks = []
        chunk_start = 0
        while chunk_start < len(content_tokens):
            chunk_end = min(chunk_start + MAX_EMBEDDING_LENGTH, len(content_tokens))
            # Find the closest newline before the length limit
            while chunk_end > chunk_start and content_tokens[chunk_end] != encoder.encode('\n')[0]:
                chunk_end -= 1
            chunks.append(content_tokens[chunk_start:chunk_end])
            chunk_start = chunk_end

        # Add title, date and part to each chunk and store its embedding in the Chroma DB
        for j, chunk in enumerate(chunks):
            document = encoder.decode(chunk)
            metadata = {
                "title": post['title'],
                "url": post['url'],
                "part": j+1,
                "total_parts": len(chunks),
                "date": post['date']
            }
            yield document, metadata

def main(name):
    encoder = GPT3Encoder()
    sourcefile = f'{name}.json'
    outputfile = f'{name}_chunked.jsonl'

    with open(sourcefile, 'r') as f:
        blog_posts = json.load(f)

    with open(outputfile, 'w') as f:
        for document, metadata in split_into_chunks(blog_posts, encoder):
            f.write(json.dumps({"document": document, "metadata": metadata}) + '\n')

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description='Split blog posts into chunks.')
    parser.add_argument('name', help='The name of the blog to process.')
    args = parser.parse_args()
    main(args.name)
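One caveat with the boundary search above: when chunk_end lands at len(content_tokens), content_tokens[chunk_end] indexes one token past the end, and a window that contains no newline token walks chunk_end all the way back to chunk_start, leaving an empty chunk and a loop that no longer advances. A minimal sketch of a guarded version of the same idea, written against a plain token list so it assumes no particular encoder (newline_token stands in for encoder.encode('\n')[0]):

from typing import List

MAX_EMBEDDING_LENGTH = 2048

def split_tokens_at_newlines(tokens: List[int], newline_token: int,
                             max_len: int = MAX_EMBEDDING_LENGTH) -> List[List[int]]:
    """Split tokens into chunks of at most max_len, preferring newline boundaries."""
    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + max_len, len(tokens))
        if end < len(tokens):
            # Walk back to just after the nearest newline, but never past the chunk start.
            boundary = end
            while boundary > start and tokens[boundary - 1] != newline_token:
                boundary -= 1
            # If the window holds no newline, keep the hard cut so the loop always advances.
            if boundary > start:
                end = boundary
        chunks.append(tokens[start:end])
        start = end
    return chunks

Feeding it encoder.encode(post['content']) would produce the same kind of chunk lists that split_into_chunks then decodes and pairs with metadata.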
28 changes: 28 additions & 0 deletions src/embeddings_helpers.py
@@ -0,0 +1,28 @@
# helper_functions.py
import json
from openai import Embedding
from chromadb import Client

def get_embedding(text:str):
    return Embedding.create(text=text)

def store_grounding_embeddings(name: str):
    # Initialize the Chroma client and get the collection
    chroma_client = Client()
    collection = chroma_client.get_collection(name)

    # Determine the source file name based on the name parameter
    sourcefile = f'{name}_chunked.jsonl'

    # Read the chunks and their metadata from the source file
    with open(sourcefile, 'r') as f:
        for line in f:
            chunk = json.loads(line)
            document = chunk['document']
            metadata = chunk['metadata']

            # Get the OpenAI embedding for the document
            embedding = get_embedding(document)

            # Store the document, its embedding, and its metadata in Chroma
            collection.add(ids=[f"{metadata['title']}_part_{metadata['part']}"], embeddings=[embedding], documents=[document], metadatas=[metadata])
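For reference, the pre-1.0 openai client documented Embedding.create as taking input= and model= rather than text=, with the vector returned under data[0].embedding. A sketch of get_embedding written against that legacy interface; the model name here is an assumption, since the commit does not pin one:

import openai

EMBEDDING_MODEL = "text-embedding-ada-002"  # assumed model; not specified in the commit

def get_embedding(text: str) -> list:
    # Legacy (pre-1.0) openai client: response is {"data": [{"embedding": [...]}], ...}
    response = openai.Embedding.create(input=[text], model=EMBEDDING_MODEL)
    return response["data"][0]["embedding"]

On the Chroma side, Client().get_collection(name) raises if the collection has not been created yet; get_or_create_collection(name) is the more forgiving choice when the pipeline runs against a fresh database.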
29 changes: 12 additions & 17 deletions src/memories.py
@@ -1,7 +1,7 @@
 from typing import List, Tuple, Dict
 from concurrent.futures import ThreadPoolExecutor
-from openai import ChatCompletion, Embedding, GPT3Encoder
-import logging
+from openai import Embedding, GPT3Encoder
+import json
 import chromadb
 from ..prompts import summarize_memory

@@ -35,21 +35,16 @@ def summarize_helpful_memories(self, question:str, similar_extracts:List[Tuple[s
         summaries = list(executor.map(summarize_memory, similar_extracts, [question]*len(similar_extracts)))
         summaries = [summary for summary in summaries if summary != "skip"]
         return summaries

-    def store_grounding_embeddings(self, blog_posts: List[Dict]):
-        # Iterate over the blog posts
-        for post in blog_posts:
-            # Reserve some tokens for the title and date
-            reserved_tokens = self.encoder.encode(f"Title: {post['title']}\nDate: {post['date']}\nPart: 1\n")
-            chunk_size = MAX_EMBEDDING_LENGTH - len(reserved_tokens)
-
-            # Split the post content into chunks
-            content_tokens = self.encoder.encode(post['content'])
-            chunks = [content_tokens[i:i+chunk_size] for i in range(0, len(content_tokens), chunk_size)]
-
-            logging.info(f"Storing {len(chunks)} chunks for post {post['title']}")
-
-            # Add title, date and part to each chunk and store its embedding in the Chroma DB
-            for j, chunk in enumerate(chunks):
-                document = f"Title: {post['title']}\nDate: {post['date']}\nPart: {j+1}\n{self.encoder.decode(chunk)}"
-                self.collection.add(documents=[document])
+    def store_grounding_embeddings(self, name: str):
+        sourcefile = f'{name}_chunked.jsonl'
+
+        with open(sourcefile, 'r') as f:
+            for line in f:
+                chunk = json.loads(line)
+                document = chunk['document']
+                metadata = chunk['metadata']
+
+                embedding = self.get_embedding(document)
+
+                self.collection.add(ids=[f"{metadata['title']}_part_{metadata['part']}"], embeddings=[embedding], documents=[document], metadatas=[metadata])
55 changes: 55 additions & 0 deletions src/ssjl.py
@@ -0,0 +1,55 @@
import os
import json
import requests
import argparse
from bs4 import BeautifulSoup
from time import sleep
from random import randrange

def fetch_json(url, params):
    endpoint = "%s/api/v1/archive" % url
    response = requests.get(endpoint, params=params)
    response.raise_for_status()
    return response.json()

def fetch_html(url):
    response = requests.get(url)
    return response.text

def fetch_and_parse(url):
    limit = 12
    offset = 0
    results_len = 1
    while results_len != 0:
        params = {'limit': limit, 'offset': offset}
        entries = fetch_json(url, params=params)
        for item in entries:
            Link = item['canonical_url']
            Title = item['title']
            Date = item['post_date']
            Html = fetch_html(Link)
            soup = BeautifulSoup(Html, 'html.parser')
            content = soup.find('div', {'class': 'markup'})
            if content:
                yield {
                    'title': Title,
                    'link': Link,
                    'date': Date,
                    'content': content.text,
                }
            timeout = randrange(2, 20)
            sleep(timeout)
        offset = limit + offset
        results_len = len(entries)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Fetch posts from a Substack blog.')
    parser.add_argument('blog', help='The name of the Substack blog to fetch posts from.')
    args = parser.parse_args()

    url = f'https://{args.blog}.substack.com'
    filename = 'data/' + url.replace('https://', '').replace('.', '-') + '.jsonl'
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w') as f:
        for post in fetch_and_parse(url):
            f.write(json.dumps(post) + '\n')
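Note the handoff between the scripts: ssjl.py writes one JSON object per line to data/<blog>-substack-com.jsonl and labels the URL field link, while chunker.py reads <name>.json as a single JSON array and looks up post['url']. A hypothetical adapter (jsonl_to_json_array is not part of this commit) bridging the two formats might look like:

import json

def jsonl_to_json_array(jsonl_path: str, json_path: str) -> None:
    # Hypothetical glue step: collect scraped JSONL posts into the JSON array the
    # chunker reads, renaming 'link' to 'url' so the chunk metadata can be built.
    posts = []
    with open(jsonl_path, 'r') as f:
        for line in f:
            post = json.loads(line)
            if 'link' in post:
                post['url'] = post.pop('link')
            posts.append(post)
    with open(json_path, 'w') as f:
        json.dump(posts, f)

For a blog named myblog, something like jsonl_to_json_array('data/myblog-substack-com.jsonl', 'myblog.json') would prepare the input that chunker.py's main('myblog') expects.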
