Skip to content

Commit

Permalink
Harrison/duckdb (langchain-ai#2064)
Browse files Browse the repository at this point in the history
Co-authored-by: Trent Hauck <[email protected]>
  • Loading branch information
hwchase17 and tshauck authored Mar 28, 2023
1 parent 76ecca4 commit f74a1be
Show file tree
Hide file tree
Showing 4 changed files with 308 additions and 1 deletion.
175 changes: 175 additions & 0 deletions docs/modules/indexes/document_loaders/examples/duckdb.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# DuckDB Loader\n",
"\n",
"Load a DuckDB query with one document per row."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import DuckDBLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Writing example.csv\n"
]
}
],
"source": [
"%%file example.csv\n",
"Team,Payroll\n",
"Nationals,81.34\n",
"Reds,82.20"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"loader = DuckDBLoader(\"SELECT * FROM read_csv_auto('example.csv')\")\n",
"\n",
"data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Document(page_content='Team: Nationals\\nPayroll: 81.34', metadata={}), Document(page_content='Team: Reds\\nPayroll: 82.2', metadata={})]\n"
]
}
],
"source": [
"print(data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Specifying Which Columns are Content vs Metadata"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"loader = DuckDBLoader(\n",
" \"SELECT * FROM read_csv_auto('example.csv')\",\n",
" page_content_columns=[\"Team\"],\n",
" metadata_columns=[\"Payroll\"]\n",
")\n",
"\n",
"data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Document(page_content='Team: Nationals', metadata={'Payroll': 81.34}), Document(page_content='Team: Reds', metadata={'Payroll': 82.2})]\n"
]
}
],
"source": [
"print(data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Adding Source to Metadata"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"loader = DuckDBLoader(\n",
" \"SELECT Team, Payroll, Team As source FROM read_csv_auto('example.csv')\",\n",
" metadata_columns=[\"source\"]\n",
")\n",
"\n",
"data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Document(page_content='Team: Nationals\\nPayroll: 81.34\\nsource: Nationals', metadata={'source': 'Nationals'}), Document(page_content='Team: Reds\\nPayroll: 82.2\\nsource: Reds', metadata={'source': 'Reds'})]\n"
]
}
],
"source": [
"print(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
4 changes: 3 additions & 1 deletion langchain/document_loaders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders.dataframe import DataFrameLoader
from langchain.document_loaders.directory import DirectoryLoader
from langchain.document_loaders.duckdb_loader import DuckDBLoader
from langchain.document_loaders.email import UnstructuredEmailLoader
from langchain.document_loaders.evernote import EverNoteLoader
from langchain.document_loaders.facebook_chat import FacebookChatLoader
Expand Down Expand Up @@ -61,7 +62,7 @@
YoutubeLoader,
)

"""Legacy: only for backwards compat. use PyPDFLoader instead"""
# Legacy: only for backwards compat. Use PyPDFLoader instead
PagedPDFSplitter = PyPDFLoader

__all__ = [
Expand Down Expand Up @@ -116,4 +117,5 @@
"AzureBlobStorageFileLoader",
"AzureBlobStorageContainerLoader",
"SitemapLoader",
"DuckDBLoader",
]
74 changes: 74 additions & 0 deletions langchain/document_loaders/duckdb_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from typing import Dict, List, Optional, cast

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class DuckDBLoader(BaseLoader):
    """Loads a query result from DuckDB into a list of documents.

    Each document represents one row of the result. The `page_content_columns`
    are written into the `page_content` of the document as ``"name: value"``
    lines joined by newlines. The `metadata_columns` are written into the
    `metadata` of the document. By default, all columns are written into the
    `page_content` and none into the `metadata`.
    """

    def __init__(
        self,
        query: str,
        database: str = ":memory:",
        read_only: bool = False,
        config: Optional[Dict[str, str]] = None,
        page_content_columns: Optional[List[str]] = None,
        metadata_columns: Optional[List[str]] = None,
    ):
        """Store the query and connection settings; nothing is executed here.

        Args:
            query: SQL text that is executed by :meth:`load`.
            database: Passed to ``duckdb.connect`` as ``database``.
            read_only: Passed to ``duckdb.connect`` as ``read_only``.
            config: Passed to ``duckdb.connect`` as ``config``; ``None`` is
                replaced by an empty dict.
            page_content_columns: Result columns rendered into
                ``page_content``. ``None`` means every column of the result.
            metadata_columns: Result columns copied into ``metadata``.
                ``None`` means no columns.
        """
        self.query = query
        self.database = database
        self.read_only = read_only
        self.config = config or {}
        self.page_content_columns = page_content_columns
        self.metadata_columns = metadata_columns

    @staticmethod
    def _column_positions(columns: List[str], field_names: List[str]) -> List[int]:
        """Return the row index of each requested column name.

        Raises:
            ValueError: If any requested column is absent from `field_names`.
        """
        first_position: Dict[str, int] = {}
        for position, name in enumerate(field_names):
            # Keep the first occurrence when the query yields duplicate column
            # names, which is what ``list.index`` would have returned.
            first_position.setdefault(name, position)

        missing = [column for column in columns if column not in first_position]
        if missing:
            raise ValueError(
                f"Columns {missing} not found in query result; "
                f"available columns are {field_names}."
            )
        return [first_position[column] for column in columns]

    def load(self) -> List[Document]:
        """Run the query and convert every result row into a document.

        Returns:
            One document per fetched row, in the order the rows were fetched.

        Raises:
            ValueError: If the ``duckdb`` package cannot be imported, or if a
                requested column is missing from a non-empty query result.
        """
        try:
            import duckdb
        except ImportError as err:
            raise ValueError(
                "Could not import duckdb python package. "
                "Please install it with `pip install duckdb`."
            ) from err

        # Everything needed is materialised inside the ``with`` block so the
        # connection is released before the rows are turned into documents.
        with duckdb.connect(
            database=self.database, read_only=self.read_only, config=self.config
        ) as con:
            query_result = con.execute(self.query)
            rows = query_result.fetchall()
            description = cast(list, query_result.description)
            field_names = [c[0] for c in description]

        if not rows:
            # Column names are only resolved when there is at least one row,
            # so an empty result yields an empty list rather than an error.
            return []

        if self.page_content_columns is None:
            page_content_columns = field_names
        else:
            page_content_columns = self.page_content_columns

        if self.metadata_columns is None:
            metadata_columns: List[str] = []
        else:
            metadata_columns = self.metadata_columns

        # Resolve name -> position once instead of scanning ``field_names``
        # for every column of every row.
        content_positions = self._column_positions(page_content_columns, field_names)
        metadata_positions = self._column_positions(metadata_columns, field_names)

        docs = []
        for row in rows:
            page_content = "\n".join(
                f"{column}: {row[position]}"
                for column, position in zip(page_content_columns, content_positions)
            )
            metadata = {
                column: row[position]
                for column, position in zip(metadata_columns, metadata_positions)
            }
            docs.append(Document(page_content=page_content, metadata=metadata))

        return docs
56 changes: 56 additions & 0 deletions tests/integration_tests/document_loaders/test_duckdb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import unittest

from langchain.document_loaders.duckdb_loader import DuckDBLoader

# Probe for the optional duckdb dependency so the tests below can be skipped
# when it is unavailable.
try:
    import duckdb  # noqa: F401
except ImportError:
    duckdb_installed = False
else:
    duckdb_installed = True


@unittest.skipIf(not duckdb_installed, "duckdb not installed")
def test_duckdb_loader_no_options() -> None:
    """With default options every column is rendered into ``page_content``."""
    documents = DuckDBLoader("SELECT 1 AS a, 2 AS b").load()

    assert len(documents) == 1
    only_doc = documents[0]
    assert only_doc.page_content == "a: 1\nb: 2"
    assert only_doc.metadata == {}


@unittest.skipIf(not duckdb_installed, "duckdb not installed")
def test_duckdb_loader_page_content_columns() -> None:
    """Only the columns named in `page_content_columns` are rendered."""

    # A UNION carries no ordering guarantee, so sort explicitly; otherwise the
    # positional assertions below depend on the engine's execution plan.
    loader = DuckDBLoader(
        "SELECT 1 AS a, 2 AS b UNION SELECT 3 AS a, 4 AS b ORDER BY a",
        page_content_columns=["a"],
    )
    docs = loader.load()

    assert len(docs) == 2
    assert docs[0].page_content == "a: 1"
    assert docs[0].metadata == {}

    assert docs[1].page_content == "a: 3"
    assert docs[1].metadata == {}


@unittest.skipIf(not duckdb_installed, "duckdb not installed")
def test_duckdb_loader_metadata_columns() -> None:
    """Columns are split between ``page_content`` and ``metadata`` as requested."""
    documents = DuckDBLoader(
        "SELECT 1 AS a, 2 AS b",
        metadata_columns=["b"],
        page_content_columns=["a"],
    ).load()

    assert len(documents) == 1
    only_doc = documents[0]
    assert only_doc.page_content == "a: 1"
    assert only_doc.metadata == {"b": 2}

0 comments on commit f74a1be

Please sign in to comment.