posted the video for 4-4

Signed-off-by: Matt Williams <[email protected]>
cainstudios · Apr 5, 2024 · 404f2b4 · 404f2b4
1 parent d0e67e5
commit 404f2b4
Show file tree

Hide file tree

Showing 7 changed files with 128 additions and 0 deletions.
diff --git a/2024-04-04-build-rag-with-python/.gitignore b/2024-04-04-build-rag-with-python/.gitignore
@@ -0,0 +1,2 @@
+my_chroma_data
+__pycache__
diff --git a/2024-04-04-build-rag-with-python/config.ini b/2024-04-04-build-rag-with-python/config.ini
@@ -0,0 +1,3 @@
+[main]
+embedmodel=nomic-embed-text
+mainmodel=gemma:2b
diff --git a/2024-04-04-build-rag-with-python/import.py b/2024-04-04-build-rag-with-python/import.py
@@ -0,0 +1,24 @@
+import ollama, chromadb, time
+from utilities import readtext, getconfig
+from mattsollamatools import chunker, chunk_text_by_sentences
+
+
+# Rebuild the "buildragwithpython" collection from the documents listed in
+# sourcedocs.txt: each document is split into sentence chunks, every chunk is
+# embedded with the configured Ollama embedding model, and the result is
+# stored in the Chroma server running on localhost:8000.
+chroma = chromadb.HttpClient(host="localhost", port=8000)
+# delete_collection raises when the collection does not exist (e.g. on the
+# very first run), so make sure it exists before resetting it.
+chroma.get_or_create_collection(name="buildragwithpython")
+chroma.delete_collection("buildragwithpython")
+collection = chroma.get_or_create_collection(name="buildragwithpython", metadata={"hnsw:space": "cosine"})
+
+embedmodel = getconfig()["embedmodel"]
+starttime = time.time()
+with open('sourcedocs.txt') as f:
+  for line in f:
+    # Lines keep their trailing newline (and sometimes a trailing space);
+    # strip them so ids and the "source" metadata hold the clean path/URL.
+    filename = line.strip()
+    if not filename:
+      # Blank lines are not documents; skip them instead of failing in readtext.
+      continue
+    text = readtext(filename)
+    chunks = chunk_text_by_sentences(source_text=text, sentences_per_chunk=7, overlap=0)
+    print(f"with {len(chunks)} chunks")
+    for index, chunk in enumerate(chunks):
+      embed = ollama.embeddings(model=embedmodel, prompt=chunk)['embedding']
+      print(".", end="", flush=True)
+      # The id combines the source and the chunk position within that source.
+      collection.add(ids=[filename + str(index)], embeddings=[embed], documents=[chunk], metadatas={"source": filename})
+
+print("--- %s seconds ---" % (time.time() - starttime))
+
diff --git a/2024-04-04-build-rag-with-python/requirements.txt b/2024-04-04-build-rag-with-python/requirements.txt
@@ -0,0 +1,4 @@
+ocrmypdf==16.1.2
+ollama==0.1.8
+python_magic==0.4.27
+Requests==2.31.0
diff --git a/2024-04-04-build-rag-with-python/search.py b/2024-04-04-build-rag-with-python/search.py
@@ -0,0 +1,21 @@
+import ollama, sys, chromadb
+from utilities import getconfig
+
+# Answer a question given on the command line: embed the question, fetch the
+# five closest chunks from the "buildragwithpython" Chroma collection, and
+# stream the main model's answer built from those chunks.
+query = " ".join(sys.argv[1:])
+if not query.strip():
+  # Without a question there is nothing to embed or to ask the model.
+  sys.exit("Usage: python search.py <question>")
+
+# Parse config.ini once and reuse the result for both model names.
+config = getconfig()
+embedmodel = config["embedmodel"]
+mainmodel = config["mainmodel"]
+chroma = chromadb.HttpClient(host="localhost", port=8000)
+collection = chroma.get_or_create_collection("buildragwithpython")
+
+queryembed = ollama.embeddings(model=embedmodel, prompt=query)['embedding']
+
+# query() returns one result list per query embedding; only one was sent.
+relevantdocs = collection.query(query_embeddings=[queryembed], n_results=5)["documents"][0]
+docs = "\n\n".join(relevantdocs)
+modelquery = f"{query} - Answer that question using the following text as a resource: {docs}"
+
+stream = ollama.generate(model=mainmodel, prompt=modelquery, stream=True)
+
+for chunk in stream:
+  # Skip empty pieces of the stream; print the rest as they arrive.
+  if chunk["response"]:
+    print(chunk['response'], end='', flush=True)
diff --git a/2024-04-04-build-rag-with-python/sourcedocs.txt b/2024-04-04-build-rag-with-python/sourcedocs.txt
@@ -0,0 +1,13 @@
+../scripts/llava.txt 
+https://www.macrumors.com/roundup/ipad/
+https://www.macrumors.com/roundup/ipad-pro/
+https://www.macrumors.com/roundup/macos-sonoma/
+https://www.macrumors.com/roundup/ios-17/
+https://www.macrumors.com/roundup/iphone-16/
+https://www.macrumors.com/roundup/apple-vision-pro/
+https://www.macrumors.com/2024/04/03/major-earthquake-in-taiwan/
+https://www.macrumors.com/2024/04/03/ios-17-5-third-party-item-tracker-alerts/
+https://www.macrumors.com/2024/04/03/ipad-launched-14-years-ago/
+https://www.macrumors.com/2024/04/03/ipados-17-5-beta-battery-health-code/
+https://www.macrumors.com/2024/04/04/iphone-16-design-showcased-by-dummy-models/
+https://www.macrumors.com/2024/04/03/everything-new-in-ios-17-5-beta-1/
diff --git a/2024-04-04-build-rag-with-python/utilities.py b/2024-04-04-build-rag-with-python/utilities.py
@@ -0,0 +1,61 @@
+import re, os, requests, magic, ollama, string, configparser
+from urllib.parse import unquote, urlparse
+from bs4 import BeautifulSoup
+
+
+def get_filename_from_cd(cd):
+    """
+    Get filename from content-disposition
+
+    cd is the raw Content-Disposition header value (or None). Returns the
+    percent-decoded filename, or None when the header is missing or has no
+    filename parameter. The extended ``filename*=`` form is preferred over
+    the plain ``filename=`` form when both are present.
+    """
+    if not cd:
+        return None
+    # Values such as "inline" carry no filename at all; indexing the result
+    # of split() unconditionally would raise IndexError for them.
+    for marker in ('filename*=', 'filename='):
+        _, found, fname = cd.partition(marker)
+        if found:
+            break
+    else:
+        return None
+    fname = fname.strip()
+    if fname.startswith('"'):
+        # Quoted value: take everything up to the closing quote.
+        fname = fname[1:].split('"')[0]
+    else:
+        # Unquoted value: it ends at the next parameter separator.
+        fname = fname.split(';')[0].strip()
+    if fname.lower().startswith(("utf-8''", "utf-8'")):
+        fname = fname.split("'")[-1]
+    return unquote(fname)
+
+def download_file(url):
+    """
+    Download url into the local 'content/' directory.
+
+    The file name comes from the response's content-disposition header when
+    it provides one; otherwise it is derived from the URL itself. Returns the
+    path of the written file ('content/<name>'). HTTP error statuses raise
+    through requests' raise_for_status().
+    """
+    # A timeout keeps an unresponsive server from blocking the run forever.
+    with requests.get(url, stream=True, timeout=30) as r:
+        r.raise_for_status()
+        filename = get_filename_from_cd(r.headers.get('content-disposition'))
+        # The header is server-controlled: keep only the last path component
+        # so a name like "../../x" cannot escape the download directory.
+        filename = os.path.basename(filename) if filename else ''
+        if not filename:
+            # Flatten the URL into a single file name, dropping the scheme.
+            filename = re.sub(r'^https?://', '', urlparse(url).geturl()).replace('/', '-')
+        # The directory may not exist yet on a fresh checkout.
+        os.makedirs('content', exist_ok=True)
+        filename = 'content/' + filename
+        with open(filename, 'wb') as f:
+            for chunk in r.iter_content(chunk_size=8192):
+                f.write(chunk)
+        return filename
+
+def readtext(path):
+  """
+  Return the text content of a local file or an http(s) URL.
+
+  path has trailing whitespace and any '%0A' sequences removed first. URLs
+  are fetched with download_file() into a temporary file that is removed
+  again before returning; local paths are read in place and never deleted.
+  Plain text is decoded as UTF-8, HTML is reduced to its text with
+  BeautifulSoup, and any other detected type (including PDF, which only
+  prints a notice) yields an empty string.
+  """
+  path = path.rstrip()
+  path = path.replace(' \n', '')
+  path = path.replace('%0A', '')
+  downloaded = re.match(r'^https?://', path) is not None
+  if downloaded:
+    filename = download_file(path)
+  else:
+    filename = os.path.abspath(path)
+
+  text = ""
+  try:
+    filetype = magic.from_file(filename, mime=True)
+    print(f"\nEmbedding {filename} as {filetype}")
+    if filetype == 'application/pdf':
+      print('PDF not supported yet')
+    elif filetype == 'text/plain':
+      with open(filename, 'rb') as f:
+        text = f.read().decode('utf-8')
+    elif filetype == 'text/html':
+      with open(filename, 'rb') as f:
+        soup = BeautifulSoup(f, 'html.parser')
+        text = soup.get_text()
+  finally:
+    # Only remove files this function downloaded itself. Matching on
+    # 'content/' in the path would also delete a user's own source document
+    # that merely lives under a directory with that name.
+    if downloaded and os.path.exists(filename):
+      os.remove(filename)
+
+  return text
+
+def getconfig():
+  """
+  Load config.ini from the current working directory and return the
+  key/value pairs of its [main] section as a plain dict.
+  """
+  parser = configparser.ConfigParser()
+  parser.read('config.ini')
+  main_section = parser.items("main")
+  return {key: value for key, value in main_section}