Harrison/duckdb (langchain-ai#2064)

Co-authored-by: Trent Hauck <[email protected]>
kevinNejad · Mar 28, 2023 · f74a1be · f74a1be
1 parent 76ecca4
commit f74a1be
Show file tree

Hide file tree

Showing 4 changed files with 308 additions and 1 deletion.
diff --git a/docs/modules/indexes/document_loaders/examples/duckdb.ipynb b/docs/modules/indexes/document_loaders/examples/duckdb.ipynb
@@ -0,0 +1,175 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# DuckDB Loader\n",
+    "\n",
+    "Load a DuckDB query with one document per row."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import DuckDBLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Writing example.csv\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%file example.csv\n",
+    "Team,Payroll\n",
+    "Nationals,81.34\n",
+    "Reds,82.20"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = DuckDBLoader(\"SELECT * FROM read_csv_auto('example.csv')\")\n",
+    "\n",
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[Document(page_content='Team: Nationals\\nPayroll: 81.34', metadata={}), Document(page_content='Team: Reds\\nPayroll: 82.2', metadata={})]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Specifying Which Columns are Content vs Metadata"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = DuckDBLoader(\n",
+    "    \"SELECT * FROM read_csv_auto('example.csv')\",\n",
+    "    page_content_columns=[\"Team\"],\n",
+    "    metadata_columns=[\"Payroll\"]\n",
+    ")\n",
+    "\n",
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[Document(page_content='Team: Nationals', metadata={'Payroll': 81.34}), Document(page_content='Team: Reds', metadata={'Payroll': 82.2})]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Adding Source to Metadata"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = DuckDBLoader(\n",
+    "    \"SELECT Team, Payroll, Team As source FROM read_csv_auto('example.csv')\",\n",
+    "    metadata_columns=[\"source\"]\n",
+    ")\n",
+    "\n",
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[Document(page_content='Team: Nationals\\nPayroll: 81.34\\nsource: Nationals', metadata={'source': 'Nationals'}), Document(page_content='Team: Reds\\nPayroll: 82.2\\nsource: Reds', metadata={'source': 'Reds'})]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py
@@ -14,6 +14,7 @@
 from langchain.document_loaders.csv_loader import CSVLoader
 from langchain.document_loaders.dataframe import DataFrameLoader
 from langchain.document_loaders.directory import DirectoryLoader
+from langchain.document_loaders.duckdb_loader import DuckDBLoader
 from langchain.document_loaders.email import UnstructuredEmailLoader
 from langchain.document_loaders.evernote import EverNoteLoader
 from langchain.document_loaders.facebook_chat import FacebookChatLoader
@@ -61,7 +62,7 @@
     YoutubeLoader,
 )
 
-"""Legacy: only for backwards compat. use PyPDFLoader instead"""
+# Legacy: only for backwards compat. Use PyPDFLoader instead
 PagedPDFSplitter = PyPDFLoader
 
 __all__ = [
@@ -116,4 +117,5 @@
     "AzureBlobStorageFileLoader",
     "AzureBlobStorageContainerLoader",
     "SitemapLoader",
+    "DuckDBLoader",
 ]
diff --git a/langchain/document_loaders/duckdb_loader.py b/langchain/document_loaders/duckdb_loader.py
@@ -0,0 +1,74 @@
+from typing import Dict, List, Optional, cast
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class DuckDBLoader(BaseLoader):
+    """Loads a query result from DuckDB into a list of documents.
+
+    Each document represents one row of the result. The `page_content_columns`
+    are written into the `page_content` of the document as ``name: value``
+    lines joined by newlines. The `metadata_columns` are written into the
+    `metadata` of the document. By default, all columns are written into the
+    `page_content` and none into the `metadata`.
+
+    Args:
+        query: SQL text executed on the connection.
+        database: Forwarded to ``duckdb.connect`` as ``database``.
+        read_only: Forwarded to ``duckdb.connect`` as ``read_only``.
+        config: Forwarded to ``duckdb.connect`` as ``config``; ``None`` is
+            replaced by an empty dict.
+        page_content_columns: Result columns rendered into `page_content`.
+            ``None`` selects every column of the result.
+        metadata_columns: Result columns copied into `metadata`. ``None``
+            selects no columns.
+    """
+
+    def __init__(
+        self,
+        query: str,
+        database: str = ":memory:",
+        read_only: bool = False,
+        config: Optional[Dict[str, str]] = None,
+        page_content_columns: Optional[List[str]] = None,
+        metadata_columns: Optional[List[str]] = None,
+    ):
+        self.query = query
+        self.database = database
+        self.read_only = read_only
+        self.config = config or {}
+        self.page_content_columns = page_content_columns
+        self.metadata_columns = metadata_columns
+
+    @staticmethod
+    def _column_positions(columns: List[str], field_names: List[str]) -> List[int]:
+        """Return the position of each requested column within `field_names`."""
+        positions = []
+        for column in columns:
+            try:
+                # list.index yields the first match, so a duplicated result
+                # column name resolves to its first occurrence.
+                positions.append(field_names.index(column))
+            except ValueError as e:
+                raise ValueError(
+                    f"Column {column!r} is not in the query result columns: "
+                    f"{field_names}"
+                ) from e
+        return positions
+
+    def load(self) -> List[Document]:
+        """Execute the query and build one document per result row.
+
+        Returns:
+            The documents, in the order the rows were returned by the query.
+
+        Raises:
+            ValueError: If the `duckdb` package cannot be imported, or if a
+                requested content/metadata column is not part of the result.
+        """
+        try:
+            import duckdb
+        except ImportError as e:
+            raise ValueError(
+                "Could not import duckdb python package. "
+                "Please install it with `pip install duckdb`."
+            ) from e
+
+        docs: List[Document] = []
+        with duckdb.connect(
+            database=self.database, read_only=self.read_only, config=self.config
+        ) as con:
+            query_result = con.execute(self.query)
+            results = query_result.fetchall()
+            description = cast(list, query_result.description)
+            field_names = [c[0] for c in description]
+
+            if self.page_content_columns is None:
+                page_content_columns = field_names
+            else:
+                page_content_columns = self.page_content_columns
+
+            if self.metadata_columns is None:
+                metadata_columns = []
+            else:
+                metadata_columns = self.metadata_columns
+
+            # Resolve column positions once instead of searching the field
+            # list again for every column of every row.
+            content_positions = self._column_positions(
+                page_content_columns, field_names
+            )
+            metadata_positions = self._column_positions(
+                metadata_columns, field_names
+            )
+
+            for result in results:
+                page_content = "\n".join(
+                    f"{column}: {result[position]}"
+                    for column, position in zip(
+                        page_content_columns, content_positions
+                    )
+                )
+
+                metadata = {
+                    column: result[position]
+                    for column, position in zip(metadata_columns, metadata_positions)
+                }
+
+                docs.append(Document(page_content=page_content, metadata=metadata))
+
+        return docs
diff --git a/tests/integration_tests/document_loaders/test_duckdb.py b/tests/integration_tests/document_loaders/test_duckdb.py
@@ -0,0 +1,56 @@
+import unittest
+
+from langchain.document_loaders.duckdb_loader import DuckDBLoader
+
+# Record whether the duckdb package can be imported; the tests below are
+# skipped when it cannot.
+try:
+    import duckdb  # noqa: F401
+
+    duckdb_installed = True
+except ImportError:
+    duckdb_installed = False
+
+
+@unittest.skipIf(not duckdb_installed, "duckdb not installed")
+def test_duckdb_loader_no_options() -> None:
+    """Default options: every column goes to page content, metadata is empty."""
+
+    documents = DuckDBLoader("SELECT 1 AS a, 2 AS b").load()
+
+    assert len(documents) == 1
+    only_doc = documents[0]
+    assert only_doc.page_content == "a: 1\nb: 2"
+    assert only_doc.metadata == {}
+
+
+@unittest.skipIf(not duckdb_installed, "duckdb not installed")
+def test_duckdb_loader_page_content_columns() -> None:
+    """Only the columns named in `page_content_columns` reach the page content."""
+
+    # A UNION carries no ordering guarantee, so sort explicitly: the
+    # positional assertions below depend on the row order.
+    loader = DuckDBLoader(
+        "SELECT 1 AS a, 2 AS b UNION SELECT 3 AS a, 4 AS b ORDER BY a",
+        page_content_columns=["a"],
+    )
+    docs = loader.load()
+
+    assert len(docs) == 2
+    assert docs[0].page_content == "a: 1"
+    assert docs[0].metadata == {}
+
+    assert docs[1].page_content == "a: 3"
+    assert docs[1].metadata == {}
+
+
+@unittest.skipIf(not duckdb_installed, "duckdb not installed")
+def test_duckdb_loader_metadata_columns() -> None:
+    """Columns named in `metadata_columns` are copied into the metadata dict."""
+
+    documents = DuckDBLoader(
+        "SELECT 1 AS a, 2 AS b",
+        page_content_columns=["a"],
+        metadata_columns=["b"],
+    ).load()
+
+    assert len(documents) == 1
+    first = documents[0]
+    assert first.page_content == "a: 1"
+    assert first.metadata == {"b": 2}