Commit 64cd3d0 (parent: 2ef341a)
Showing 4 changed files with 146 additions and 17 deletions.
@@ -0,0 +1,51 @@
import json
from typing import Dict, List

# The openai package provides no GPT3Encoder class (as the original import
# assumed); tiktoken supplies the equivalent BPE tokenizer, and GPT-3 uses
# the GPT-2 vocabulary ("gpt2").
import tiktoken

MAX_EMBEDDING_LENGTH = 2048


def split_into_chunks(blog_posts: List[Dict], encoder: tiktoken.Encoding):
    for post in blog_posts:
        # Split the post content into chunks of at most MAX_EMBEDDING_LENGTH tokens
        content_tokens = encoder.encode(post['content'])
        newline_token = encoder.encode('\n')[0]

        chunks = []
        chunk_start = 0
        while chunk_start < len(content_tokens):
            chunk_end = min(chunk_start + MAX_EMBEDDING_LENGTH, len(content_tokens))
            # Find the closest newline before the length limit; inspecting the
            # token at split_at - 1 avoids the IndexError the original hit
            # when chunk_end == len(content_tokens)
            split_at = chunk_end
            while split_at > chunk_start and content_tokens[split_at - 1] != newline_token:
                split_at -= 1
            if split_at == chunk_start:
                # No newline in this window: hard-cut at the limit so the loop
                # always makes progress instead of appending an empty chunk
                # and spinning forever
                split_at = chunk_end
            chunks.append(content_tokens[chunk_start:split_at])
            chunk_start = split_at

        # Attach title, date and part number to each chunk; the caller stores
        # the embedding in the Chroma DB
        for j, chunk in enumerate(chunks):
            document = encoder.decode(chunk)
            metadata = {
                "title": post['title'],
                "url": post['url'],
                "part": j + 1,
                "total_parts": len(chunks),
                "date": post['date'],
            }
            yield document, metadata


def main(name):
    encoder = tiktoken.get_encoding("gpt2")
    sourcefile = f'{name}.json'
    outputfile = f'{name}_chunked.jsonl'

    with open(sourcefile, 'r') as f:
        blog_posts = json.load(f)

    with open(outputfile, 'w') as f:
        for document, metadata in split_into_chunks(blog_posts, encoder):
            f.write(json.dumps({"document": document, "metadata": metadata}) + '\n')


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Split blog posts into chunks.')
    parser.add_argument('name', help='The name of the blog to process.')
    args = parser.parse_args()
    main(args.name)
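
For reference, a hedged sketch of driving the chunker directly; the module name `chunk_posts` is hypothetical (the commit does not show this file's name), and the post record is a made-up single-entry example:

# example_usage.py -- illustrative only; `chunk_posts` is a hypothetical
# name for the script above
import json
import tiktoken
from chunk_posts import split_into_chunks

encoder = tiktoken.get_encoding("gpt2")
posts = [{
    "title": "Hello world",
    "url": "https://example.com/hello",
    "date": "2023-01-01",
    "content": "First paragraph.\nSecond paragraph.\n",
}]
for document, metadata in split_into_chunks(posts, encoder):
    print(json.dumps({"document": document, "metadata": metadata}))
# Prints one JSONL record per chunk; a post this short fits in a
# single chunk, so metadata reports part 1 of 1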
@@ -0,0 +1,28 @@
# helper_functions.py
import json

from chromadb import Client
# openai<1.0 exposes Embedding at module level and reads OPENAI_API_KEY
# from the environment
from openai import Embedding


def get_embedding(text: str):
    # Embedding.create takes `input` and `model` (not `text` as originally
    # written); the model name here is an assumption, since the commit does
    # not specify one
    response = Embedding.create(input=text, model="text-embedding-ada-002")
    return response["data"][0]["embedding"]


def store_grounding_embeddings(name: str):
    # Initialize the Chroma client and get the collection
    chroma_client = Client()
    collection = chroma_client.get_collection(name)

    # Determine the source file name based on the name parameter
    sourcefile = f'{name}_chunked.jsonl'

    # Read the chunks and their metadata from the source file
    with open(sourcefile, 'r') as f:
        for line in f:
            chunk = json.loads(line)
            document = chunk['document']
            metadata = chunk['metadata']

            # Get the OpenAI embedding for the document
            embedding = get_embedding(document)

            # Store the document, its embedding, and its metadata in Chroma
            collection.add(
                ids=[f"{metadata['title']}_part_{metadata['part']}"],
                embeddings=[embedding],
                documents=[document],
                metadatas=[metadata],
            )
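
Once `store_grounding_embeddings` has populated the collection, retrieval is a nearest-neighbour lookup. A minimal sketch, reusing `get_embedding` so queries and documents share one embedding model; the collection name and question are placeholders:

# query_example.py -- illustrative sketch, not part of the commit
from chromadb import Client
from helper_functions import get_embedding

collection = Client().get_collection("myblog")  # placeholder collection name
results = collection.query(
    query_embeddings=[get_embedding("What does this blog say about embeddings?")],
    n_results=3,
)
# Chroma returns one result list per query embedding
for document, metadata in zip(results["documents"][0], results["metadatas"][0]):
    print(metadata["title"], "part", metadata["part"], "of", metadata["total_parts"])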
@@ -0,0 +1,55 @@
import argparse
import json
import os
from random import randrange
from time import sleep

import requests
from bs4 import BeautifulSoup


def fetch_json(url, params):
    endpoint = "%s/api/v1/archive" % url
    response = requests.get(endpoint, params=params)
    response.raise_for_status()
    return response.json()


def fetch_html(url):
    response = requests.get(url)
    return response.text


def fetch_and_parse(url):
    limit = 12
    offset = 0
    results_len = 1
    # Page through the archive API until it returns an empty list
    while results_len != 0:
        params = {'limit': limit, 'offset': offset}
        entries = fetch_json(url, params=params)
        for item in entries:
            link = item['canonical_url']
            title = item['title']
            date = item['post_date']
            html = fetch_html(link)
            soup = BeautifulSoup(html, 'html.parser')
            content = soup.find('div', {'class': 'markup'})
            if content:
                yield {
                    'title': title,
                    'link': link,
                    'date': date,
                    'content': content.text,
                }
            # Random pause between requests to avoid hammering the server
            timeout = randrange(2, 20)
            sleep(timeout)
        offset = limit + offset
        results_len = len(entries)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Fetch posts from a Substack blog.')
    parser.add_argument('blog', help='The name of the Substack blog to fetch posts from.')
    args = parser.parse_args()

    url = f'https://{args.blog}.substack.com'
    filename = 'data/' + url.replace('https://', '').replace('.', '-') + '.jsonl'
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w') as f:
        for post in fetch_and_parse(url):
            f.write(json.dumps(post) + '\n')
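
Taken together, the commit sketches a pipeline: scrape posts, chunk them, then embed and store. Note the scraper emits JSON Lines with a `link` key, while the chunking script loads a JSON array and reads a `url` key, so some conversion step is implied between the two; a hedged sketch of that glue follows (file names are placeholders):

# convert.py -- illustrative glue, not part of the commit
import json

posts = []
with open('data/myblog-substack-com.jsonl') as f:  # placeholder blog name
    for line in f:
        post = json.loads(line)
        post['url'] = post.pop('link')  # chunker expects 'url', scraper wrote 'link'
        posts.append(post)

# Write the JSON array the chunking script loads as '{name}.json'
with open('myblog.json', 'w') as f:
    json.dump(posts, f)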