Skip to content

Commit

Permalink
Merge pull request PromtEngineer#75 from teleprint-me/issue-72-ingest…
Browse files Browse the repository at this point in the history
…-device-options

Issue 72: Refactor document loading, add device selection, and update README
  • Loading branch information
PromtEngineer authored Jun 7, 2023
2 parents f213303 + 4d19de2 commit 8cdeba7
Show file tree
Hide file tree
Showing 9 changed files with 544 additions and 134 deletions.
166 changes: 163 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,164 @@
# Ignore vscode
/.vscode
/DB
/venv
/.idea
/__pycache__/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
14 changes: 13 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,19 @@ The current default file types are .txt, .pdf, .csv, and .xlsx, if you want to u
Run the following command to ingest all the data.

```shell
python ingest.py
python ingest.py # defaults to cuda
```

Use the device type argument to specify a given device.

```sh
python ingest.py --device_type cpu
```

Use help for a full list of supported devices.

```sh
python ingest.py --help
```

It will create an index containing the local vectorstore. This may take some time, depending on the size of your documents.
Expand Down
20 changes: 19 additions & 1 deletion constants.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,16 @@
import os

# from dotenv import load_dotenv
from chromadb.config import Settings

# https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/excel.html?highlight=xlsx#microsoft-excel
from langchain.document_loaders import (
CSVLoader,
PDFMinerLoader,
TextLoader,
UnstructuredExcelLoader,
)

# load_dotenv()
ROOT_DIRECTORY = os.path.dirname(os.path.realpath(__file__))

Expand All @@ -15,4 +24,13 @@
chroma_db_impl='duckdb+parquet',
persist_directory=PERSIST_DIRECTORY,
anonymized_telemetry=False
)
)

# https://python.langchain.com/en/latest/_modules/langchain/document_loaders/excel.html#UnstructuredExcelLoader
# Maps a file extension to the LangChain loader class used to ingest files of
# that type. Keys include the leading dot, matching the second element
# returned by os.path.splitext(), which is how callers derive the lookup key.
# Lookup is an exact, case-sensitive string match.
DOCUMENT_MAP = {
    ".txt": TextLoader,
    ".pdf": PDFMinerLoader,
    ".csv": CSVLoader,
    # Both legacy (.xls) and current (.xlsx) Excel workbooks use the same loader.
    ".xls": UnstructuredExcelLoader,
    ".xlsx": UnstructuredExcelLoader,  # was misspelled ".xlxs", so .xlsx files were never matched
}
102 changes: 58 additions & 44 deletions ingest.py
Original file line number Diff line number Diff line change
@@ -1,63 +1,70 @@
import os
import click
from typing import List
from utils import xlxs_to_csv
from langchain.document_loaders import TextLoader, PDFMinerLoader, CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

import click
from langchain.docstore.document import Document
from constants import CHROMA_SETTINGS, SOURCE_DIRECTORY, PERSIST_DIRECTORY
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

from constants import (CHROMA_SETTINGS, DOCUMENT_MAP, PERSIST_DIRECTORY,
SOURCE_DIRECTORY)


def load_single_document(file_path: str) -> Document:
# Loads a single document from a file path
if file_path.endswith(".txt"):
loader = TextLoader(file_path, encoding="utf8")
elif file_path.endswith(".pdf"):
loader = PDFMinerLoader(file_path)
elif file_path.endswith(".csv"):
loader = CSVLoader(file_path)
file_extension = os.path.splitext(file_path)[1]
loader_class = DOCUMENT_MAP.get(file_extension)
if loader_class:
loader = loader_class(file_path)
else:
raise ValueError("Document type is undefined")
return loader.load()[0]


def load_documents(source_dir: str) -> List[Document]:
# Loads all documents from source documents directory
# Loads all documents from the source documents directory
all_files = os.listdir(source_dir)
docs = []
for file_path in all_files:
if file_path[-4:] == 'xlsx':
for doc in xlxs_to_csv(f"{source_dir}/{file_path}"):
docs.append(load_single_document(doc))
elif file_path[-4:] in ['.txt', '.pdf', '.csv']:
docs.append(load_single_document(f"{source_dir}/{file_path}"))
file_extension = os.path.splitext(file_path)[1]
source_file_path = os.path.join(source_dir, file_path)
if file_extension in DOCUMENT_MAP.keys():
docs.append(load_single_document(source_file_path))
return docs
# return [load_single_document(f"{source_dir}/{file_path}") for file_path in all_files if
# file_path[-4:] in ['.txt', '.pdf', '.csv']]


# @click.command()
# @click.option('--device_type', default='gpu', help='device to run on, select gpu or cpu')
# def main(device_type, ):
# # load the instructorEmbeddings
# if device_type in ['cpu', 'CPU']:
# device='cpu'
# else:
# device='cuda'


@click.command()
@click.option('--device_type', default='cuda', help='device to run on, select gpu, cpu or mps')
def main(device_type, ):
# load the instructorEmbeddings
if device_type in ['cpu', 'CPU']:
device='cpu'
elif device_type in ['mps', 'MPS']:
device='mps'
else:
device='cuda'

#  Load documents and split in chunks
@click.option(
"--device_type",
default="cuda",
type=click.Choice(
[
"cpu",
"cuda",
"ipu",
"xpu",
"mkldnn",
"opengl",
"opencl",
"ideep",
"hip",
"ve",
"fpga",
"ort",
"xla",
"lazy",
"vulkan",
"mps",
"meta",
"hpu",
"mtia",
]
),
help="Device to run on. (Default is cuda)",
)
def main(device_type):
# Load documents and split in chunks
print(f"Loading documents from {SOURCE_DIRECTORY}")
documents = load_documents(SOURCE_DIRECTORY)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
Expand All @@ -66,10 +73,17 @@ def main(device_type, ):
print(f"Split into {len(texts)} chunks of text")

# Create embeddings
embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl",
model_kwargs={"device": device})
embeddings = HuggingFaceInstructEmbeddings(
model_name="hkunlp/instructor-xl",
model_kwargs={"device": device_type},
)

db = Chroma.from_documents(texts, embeddings, persist_directory=PERSIST_DIRECTORY, client_settings=CHROMA_SETTINGS)
db = Chroma.from_documents(
texts,
embeddings,
persist_directory=PERSIST_DIRECTORY,
client_settings=CHROMA_SETTINGS,
)
db.persist()
db = None

Expand Down
Loading

0 comments on commit 8cdeba7

Please sign in to comment.