Skip to content

Commit

Permalink
posted the video for 4-4
Browse files Browse the repository at this point in the history
Signed-off-by: Matt Williams <[email protected]>
  • Loading branch information
technovangelist committed Apr 5, 2024
1 parent d0e67e5 commit 404f2b4
Show file tree
Hide file tree
Showing 7 changed files with 128 additions and 0 deletions.
2 changes: 2 additions & 0 deletions 2024-04-04-build-rag-with-python/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
my_chroma_data
__pycache__
3 changes: 3 additions & 0 deletions 2024-04-04-build-rag-with-python/config.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[main]
embedmodel=nomic-embed-text
mainmodel=gemma:2b
24 changes: 24 additions & 0 deletions 2024-04-04-build-rag-with-python/import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import ollama, chromadb, time
from utilities import readtext, getconfig
from mattsollamatools import chunker, chunk_text_by_sentences


# Rebuild the "buildragwithpython" Chroma collection from the documents listed
# in sourcedocs.txt: read each source, split it into sentence chunks, embed
# every chunk with the configured Ollama embedding model and store the result.
chroma = chromadb.HttpClient(host="localhost", port=8000)

# Every run starts from a clean collection. On a first run there is nothing to
# delete and delete_collection raises; the exception class is not the same in
# all chromadb releases, so the catch is deliberately broad and reported.
try:
    chroma.delete_collection("buildragwithpython")
except Exception as err:
    print(f"No existing collection removed: {err}")
collection = chroma.get_or_create_collection(name="buildragwithpython", metadata={"hnsw:space": "cosine"})

embedmodel = getconfig()["embedmodel"]
starttime = time.time()
with open('sourcedocs.txt') as f:
    # One source (local path or URL) per line. Strip the line terminator so it
    # does not end up inside ids/metadata, and ignore blank lines.
    lines = [line.strip() for line in f if line.strip()]

for filename in lines:
    text = readtext(filename)
    chunks = chunk_text_by_sentences(source_text=text, sentences_per_chunk=7, overlap=0)
    print(f"with {len(chunks)} chunks")
    for index, chunk in enumerate(chunks):
        embed = ollama.embeddings(model=embedmodel, prompt=chunk)['embedding']
        print(".", end="", flush=True)
        # The id is the source name plus the chunk position within that source.
        collection.add(
            ids=[filename + str(index)],
            embeddings=[embed],
            documents=[chunk],
            metadatas=[{"source": filename}],
        )

print("--- %s seconds ---" % (time.time() - starttime))

4 changes: 4 additions & 0 deletions 2024-04-04-build-rag-with-python/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ocrmypdf==16.1.2
ollama==0.1.8
python_magic==0.4.27
Requests==2.31.0
21 changes: 21 additions & 0 deletions 2024-04-04-build-rag-with-python/search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import ollama, sys, chromadb
from utilities import getconfig

# Answer a question given on the command line: embed the question, fetch the
# five closest chunks from the "buildragwithpython" collection and stream the
# main model's answer, which is prompted with those chunks as reference text.
config = getconfig()  # parse config.ini once and reuse it for both models
embedmodel = config["embedmodel"]
mainmodel = config["mainmodel"]

query = " ".join(sys.argv[1:])
if not query.strip():
    # Without a question there is nothing to embed or ask; stop before making
    # any network call. sys.exit with a string prints it to stderr, status 1.
    sys.exit("usage: python search.py <question>")

chroma = chromadb.HttpClient(host="localhost", port=8000)
collection = chroma.get_or_create_collection("buildragwithpython")

queryembed = ollama.embeddings(model=embedmodel, prompt=query)['embedding']

# query() returns one result list per query embedding; only one was sent.
relevantdocs = collection.query(query_embeddings=[queryembed], n_results=5)["documents"][0]
docs = "\n\n".join(relevantdocs)
modelquery = f"{query} - Answer that question using the following text as a resource: {docs}"

stream = ollama.generate(model=mainmodel, prompt=modelquery, stream=True)

for chunk in stream:
    # Print each non-empty piece as it arrives so the answer appears live.
    if chunk["response"]:
        print(chunk['response'], end='', flush=True)
13 changes: 13 additions & 0 deletions 2024-04-04-build-rag-with-python/sourcedocs.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
../scripts/llava.txt
https://www.macrumors.com/roundup/ipad/
https://www.macrumors.com/roundup/ipad-pro/
https://www.macrumors.com/roundup/macos-sonoma/
https://www.macrumors.com/roundup/ios-17/
https://www.macrumors.com/roundup/iphone-16/
https://www.macrumors.com/roundup/apple-vision-pro/
https://www.macrumors.com/2024/04/03/major-earthquake-in-taiwan/
https://www.macrumors.com/2024/04/03/ios-17-5-third-party-item-tracker-alerts/
https://www.macrumors.com/2024/04/03/ipad-launched-14-years-ago/
https://www.macrumors.com/2024/04/03/ipados-17-5-beta-battery-health-code/
https://www.macrumors.com/2024/04/04/iphone-16-design-showcased-by-dummy-models/
https://www.macrumors.com/2024/04/03/everything-new-in-ios-17-5-beta-1/
61 changes: 61 additions & 0 deletions 2024-04-04-build-rag-with-python/utilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import re, os, requests, magic, ollama, string, configparser
from urllib.parse import unquote, urlparse
from bs4 import BeautifulSoup


def get_filename_from_cd(cd):
    """
    Get filename from content-disposition

    Args:
        cd: Value of a Content-Disposition header, or None/empty.

    Returns:
        The percent-decoded file name, or None when the header is missing or
        carries no usable ``filename`` / ``filename*`` parameter.

    An extended ``filename*=charset'lang'value`` parameter is preferred over a
    plain ``filename=`` one when both are present. Surrounding double quotes
    and any following ``; param`` text are not part of the returned name.
    """
    if not cd:
        return None

    plain = None
    extended = None
    # The value is either a double-quoted string or runs up to the next ';'.
    pattern = r'\bfilename(\*)?\s*=\s*("[^"]*"|[^;]*)'
    for match in re.finditer(pattern, cd, flags=re.IGNORECASE):
        value = match.group(2).strip().strip('"')
        if match.group(1):
            if extended is None:
                extended = value
        elif plain is None:
            plain = value

    is_extended = bool(extended)
    fname = extended if is_extended else plain
    if not fname:
        # No filename parameter at all (e.g. a bare "inline"), or an empty one.
        return None

    # Drop the "charset'lang'" prefix. Also accepted on a plain filename= that
    # starts with "utf-8'", which some servers send.
    if is_extended or fname.lower().startswith("utf-8'"):
        fname = fname.split("'")[-1]
    return unquote(fname) or None

def download_file(url):
    """
    Download *url* into the local ``content/`` directory.

    The file name comes from the response's Content-Disposition header when it
    provides one; otherwise it is derived from the URL with the scheme removed
    and every '/' replaced by '-'.

    Args:
        url: The http(s) URL to fetch.

    Returns:
        The path of the written file, always of the form ``content/<name>``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        filename = get_filename_from_cd(r.headers.get('content-disposition'))
        if filename:
            # The header is controlled by the server: keep only the last path
            # component so the file cannot be written outside content/.
            filename = os.path.basename(filename.replace('\\', '/'))
            if filename in ('.', '..'):
                filename = None
        if not filename:
            # Strip either scheme, not only https://, then flatten the path.
            without_scheme = re.sub(r'^https?://', '', urlparse(url).geturl())
            filename = without_scheme.replace('/', '-')
        # The target directory may not exist yet on a fresh checkout.
        os.makedirs('content', exist_ok=True)
        filename = 'content/' + filename
        with open(filename, 'wb') as f:
            # Stream to disk in 8 KiB pieces instead of loading the whole body.
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return filename

def readtext(path):
    """
    Return the text content of a local file or of a downloaded URL.

    Args:
        path: A filesystem path or an http(s) URL. Trailing whitespace and
            stray encoded newlines ('%0A') are removed first.

    Returns:
        The decoded text for 'text/plain' files, the extracted text for
        'text/html' files, and an empty string for any other detected type
        (PDF is detected but not supported yet).

    A URL is downloaded to a temporary file that is removed again before
    returning, also when reading fails. Local files are never removed.
    """
    path = path.rstrip()
    path = path.replace(' \n', '')
    path = path.replace('%0A', '')

    # Remember where the file came from: only files fetched here are deleted
    # afterwards. Matching on 'content/' in the path would also delete a
    # user's own file that lives in a directory of that name.
    downloaded = re.match(r'^https?://', path) is not None
    if downloaded:
        filename = download_file(path)
    else:
        filename = os.path.abspath(path)

    try:
        filetype = magic.from_file(filename, mime=True)
        print(f"\nEmbedding {filename} as {filetype}")
        text = ""
        if filetype == 'application/pdf':
            print('PDF not supported yet')
        elif filetype == 'text/plain':
            with open(filename, 'rb') as f:
                text = f.read().decode('utf-8')
        elif filetype == 'text/html':
            with open(filename, 'rb') as f:
                soup = BeautifulSoup(f, 'html.parser')
            text = soup.get_text()
    finally:
        if downloaded and os.path.exists(filename):
            os.remove(filename)

    return text

def getconfig():
    """Read ``config.ini`` from the working directory and return its ``[main]`` section as a dict."""
    parser = configparser.ConfigParser()
    parser.read('config.ini')
    return {option: value for option, value in parser.items("main")}

0 comments on commit 404f2b4

Please sign in to comment.