forked from technovangelist/videoprojects
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Matt Williams <[email protected]>
- Loading branch information
1 parent
d0e67e5
commit 404f2b4
Showing
7 changed files
with
128 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
my_chroma_data | ||
__pycache__ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
[main] | ||
embedmodel=nomic-embed-text | ||
mainmodel=gemma:2b |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
import ollama, chromadb, time | ||
from utilities import readtext, getconfig | ||
from mattsollamatools import chunker, chunk_text_by_sentences | ||
|
||
|
||
# Rebuild the "buildragwithpython" Chroma collection from scratch: every
# source listed in sourcedocs.txt is read, split into sentence chunks,
# embedded with the configured Ollama embedding model and stored.
chroma = chromadb.HttpClient(host="localhost", port=8000)
# Make sure the collection exists before deleting it, so the very first run
# (when there is nothing to delete yet) does not abort on a missing collection.
chroma.get_or_create_collection(name="buildragwithpython")
chroma.delete_collection("buildragwithpython")
collection = chroma.get_or_create_collection(name="buildragwithpython", metadata={"hnsw:space": "cosine"})

embedmodel = getconfig()["embedmodel"]
starttime = time.time()
with open('sourcedocs.txt') as f:
    # Strip the trailing newline from each entry (it would otherwise leak into
    # the stored ids and "source" metadata) and ignore blank lines.
    sources = [line.strip() for line in f if line.strip()]

for filename in sources:
    text = readtext(filename)
    chunks = chunk_text_by_sentences(source_text=text, sentences_per_chunk=7, overlap=0)
    print(f"with {len(chunks)} chunks")
    for index, chunk in enumerate(chunks):
        embed = ollama.embeddings(model=embedmodel, prompt=chunk)['embedding']
        print(".", end="", flush=True)
        # One record per chunk; the id is the source name plus the chunk index.
        collection.add(
            ids=[filename + str(index)],
            embeddings=[embed],
            documents=[chunk],
            metadatas=[{"source": filename}],
        )

print("--- %s seconds ---" % (time.time() - starttime))
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
ocrmypdf==16.1.2 | ||
ollama==0.1.8 | ||
python_magic==0.4.27 | ||
Requests==2.31.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
import ollama, sys, chromadb | ||
from utilities import getconfig | ||
|
||
# Answer a question given on the command line: embed the question, fetch the
# closest chunks from the "buildragwithpython" collection and stream the main
# model's answer, using those chunks as the resource text.
config = getconfig()  # parse config.ini once instead of once per key
embedmodel = config["embedmodel"]
mainmodel = config["mainmodel"]

query = " ".join(sys.argv[1:]).strip()
if not query:
    # Nothing to embed or ask; stop before contacting Chroma or Ollama.
    sys.exit(f"usage: {sys.argv[0]} <question>")

chroma = chromadb.HttpClient(host="localhost", port=8000)
collection = chroma.get_or_create_collection("buildragwithpython")

queryembed = ollama.embeddings(model=embedmodel, prompt=query)['embedding']

# Only one query embedding is sent, so take the first (and only) result list.
relevantdocs = collection.query(query_embeddings=[queryembed], n_results=5)["documents"][0]
docs = "\n\n".join(relevantdocs)
modelquery = f"{query} - Answer that question using the following text as a resource: {docs}"

stream = ollama.generate(model=mainmodel, prompt=modelquery, stream=True)

for chunk in stream:
    if chunk["response"]:
        print(chunk['response'], end='', flush=True)
# Finish the streamed answer with a newline so the shell prompt starts cleanly.
print()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
../scripts/llava.txt | ||
https://www.macrumors.com/roundup/ipad/ | ||
https://www.macrumors.com/roundup/ipad-pro/ | ||
https://www.macrumors.com/roundup/macos-sonoma/ | ||
https://www.macrumors.com/roundup/ios-17/ | ||
https://www.macrumors.com/roundup/iphone-16/ | ||
https://www.macrumors.com/roundup/apple-vision-pro/ | ||
https://www.macrumors.com/2024/04/03/major-earthquake-in-taiwan/ | ||
https://www.macrumors.com/2024/04/03/ios-17-5-third-party-item-tracker-alerts/ | ||
https://www.macrumors.com/2024/04/03/ipad-launched-14-years-ago/ | ||
https://www.macrumors.com/2024/04/03/ipados-17-5-beta-battery-health-code/ | ||
https://www.macrumors.com/2024/04/04/iphone-16-design-showcased-by-dummy-models/ | ||
https://www.macrumors.com/2024/04/03/everything-new-in-ios-17-5-beta-1/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
import re, os, requests, magic, ollama, string, configparser | ||
from urllib.parse import unquote, urlparse | ||
from bs4 import BeautifulSoup | ||
|
||
|
||
def get_filename_from_cd(cd):
    """
    Get filename from content-disposition

    Accepts the raw ``Content-Disposition`` header value (or ``None``) and
    returns the percent-decoded file name, or ``None`` when the header is
    missing or carries no usable file name.

    The extended ``filename*=`` parameter is preferred over plain
    ``filename=``. Surrounding quotes, trailing ``; param`` text and any
    directory components are removed, so the result is always a bare name.
    """
    if not cd:
        return None

    extended = re.search(r"filename\*\s*=\s*([^;]+)", cd, re.IGNORECASE)
    plain = re.search(r"filename\s*=\s*(\"[^\"]*\"|[^;]+)", cd, re.IGNORECASE)
    match = extended or plain
    if match is None:
        # e.g. "inline" or "attachment" with no filename parameter at all.
        return None

    fname = match.group(1).strip().strip('"')
    # Extended values look like  utf-8'<lang>'<percent-encoded-name>.
    if fname.lower().startswith(("utf-8''", "utf-8'")):
        fname = fname.split("'")[-1]
    fname = unquote(fname)

    # The header is untrusted input: keep only the final path component so a
    # value such as "../../x" cannot point outside the download directory.
    fname = fname.replace("\\", "/").rsplit("/", 1)[-1]
    return fname or None
|
||
def download_file(url, timeout=30):
    """
    Download *url* into the local ``content/`` directory.

    The file name comes from the response's content-disposition header when
    one is present; otherwise it is derived from the URL with the scheme
    removed and every ``/`` replaced by ``-``.

    :param url: http(s) address to fetch.
    :param timeout: seconds passed to ``requests.get`` so a stalled server
        cannot block the caller forever.
    :return: path of the written file, always starting with ``content/``.
    :raises requests.HTTPError: when the server answers with an error status.
    """
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        filename = get_filename_from_cd(r.headers.get('content-disposition'))
        if not filename:
            # Drop either scheme; plain http:// URLs are accepted by readtext too.
            filename = re.sub(r'^https?://', '', urlparse(url).geturl())
        # Flatten path separators so the file always lands directly in content/,
        # even when the name came from an untrusted response header.
        filename = 'content/' + filename.replace('/', '-')
        # The download directory is not guaranteed to exist on a fresh checkout.
        os.makedirs('content', exist_ok=True)
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return filename
|
||
def readtext(path):
    """
    Return the text content of a local file or of a downloaded URL.

    *path* is cleaned of trailing whitespace and stray newline markers. If it
    starts with ``http://`` or ``https://`` it is fetched with
    ``download_file``; otherwise it is resolved to an absolute local path.
    The MIME type reported by ``magic`` selects the reader:

    * ``text/plain`` - decoded as UTF-8
    * ``text/html``  - visible text extracted with BeautifulSoup
    * ``application/pdf`` - not supported yet, yields ``""``

    Any other type also yields ``""``. Only files that this call downloaded
    are removed afterwards; local source documents are never deleted.
    """
    path = path.rstrip()
    path = path.replace(' \n', '')
    path = path.replace('%0A', '')

    downloaded = re.match(r'^https?://', path) is not None
    if downloaded:
        filename = download_file(path)
    else:
        filename = os.path.abspath(path)

    text = ""
    try:
        filetype = magic.from_file(filename, mime=True)
        print(f"\nEmbedding {filename} as {filetype}")
        if filetype == 'application/pdf':
            print('PDF not supported yet')
        elif filetype == 'text/plain':
            with open(filename, 'rb') as f:
                text = f.read().decode('utf-8')
        elif filetype == 'text/html':
            with open(filename, 'rb') as f:
                soup = BeautifulSoup(f, 'html.parser')
            text = soup.get_text()
    finally:
        # Clean up the temporary download even when reading failed. Deciding by
        # origin rather than by "content/" appearing in the path keeps local
        # files that merely live under a content/ directory safe from deletion.
        if downloaded and os.path.exists(filename):
            os.remove(filename)

    return text
|
||
def getconfig():
    """
    Load ``config.ini`` from the current working directory and return the
    options of its ``[main]`` section as a plain ``dict`` of strings.
    """
    parser = configparser.ConfigParser()
    parser.read('config.ini')
    return {option: value for option, value in parser.items("main")}