Commit

Adding docstrings
ajosh0504 committed Sep 3, 2024
1 parent 47c91ed commit 4f74102
Showing 1 changed file with 40 additions and 10 deletions.
50 changes: 40 additions & 10 deletions notebooks/rag/self_querying_mongodb_unstructured_langgraph.ipynb
@@ -200,10 +200,10 @@
"id": "4XAZrBVM9eqV"
},
"source": [
"## Step 3: Partition, chunk and embed PDF files from S3\n",
"## Step 3: Partition, chunk and embed PDF files\n",
"\n",
"Let's set up the PDF preprocessing pipeline with Unstructured. The pipeline will:\n",
"1. Ingest data from an S3 bucket\n",
"1. Ingest data from an S3 bucket/local directory\n",
"2. Partition documents: extract text and metadata, split the documents into document elements, such as titles, paragraphs (narrative text), tables, images, lists, etc. Learn more about document elements in [Unstructured documentation])https://docs.unstructured.io/api-reference/api-services/document-elements).\n",
"3. Chunk the documents.\n",
"4. Embed the documents with the [`BAAI/bge-base-en-v1.5`](https://huggingface.co/BAAI/bge-base-en-v1.5) embedding model the Hugging Face Hub.\n",
@@ -251,7 +251,7 @@
},
"outputs": [],
"source": [
"work_dir = \"/content/temp\""
"WORK_DIR = \"/content/temp\""
]
},
{
@@ -268,7 +268,7 @@
"source": [
"Pipeline.from_configs(\n",
" context=ProcessorConfig(\n",
" verbose=True, tqdm=True, num_processes=5, work_dir=work_dir\n",
" verbose=True, tqdm=True, num_processes=5, work_dir=WORK_DIR\n",
" ),\n",
" indexer_config=S3IndexerConfig(remote_url=AWS_S3_NAME),\n",
" downloader_config=S3DownloaderConfig(),\n",
@@ -341,6 +341,15 @@
"outputs": [],
"source": [
"def get_fiscal_year(elements: dict) -> int:\n",
" \"\"\"\n",
" Extract fiscal year from document elements.\n",
"\n",
" Args:\n",
" elements (dict): Document elements\n",
"\n",
" Returns:\n",
" int: Year\n",
" \"\"\"\n",
" # Regular expression pattern to find the element containing the fiscal year\n",
" pattern = r\"for the (fiscal\\s+)?year ended.*?(\\d{4})\"\n",
" year = 0\n",
@@ -352,10 +361,25 @@
" year = int(year)\n",
" except:\n",
" year = 0\n",
" return year\n",
" return year"
]
},
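As a quick sanity check of the pattern used in `get_fiscal_year`, the snippet below runs it against an invented filing line; `group(2)` captures the four-digit year.

```python
import re

# Same pattern as in get_fiscal_year; the sample text is invented for illustration.
pattern = r"for the (fiscal\s+)?year ended.*?(\d{4})"
sample = "for the fiscal year ended December 31, 2023"

match = re.search(pattern, sample)
if match:
    print(int(match.group(2)))  # -> 2023
```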
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_company_name(elements: dict) -> str:\n",
" \"\"\"\n",
" Extract company name from document elements.\n",
"\n",
" Args:\n",
" elements (dict): Document elements\n",
"\n",
"def get_company_name(elements: dict) -> str:\n",
" Returns:\n",
" str: Company name\n",
" \"\"\"\n",
" name = \"\"\n",
" # In most cases the name of the company is right before/above the following line\n",
" substring = \"(Exact name of registrant as specified\"\n",
@@ -409,7 +433,13 @@
},
"outputs": [],
"source": [
"def add_custom_metadata_to_json_outputs(directory):\n",
"def add_custom_metadata_to_json_outputs(directory: str) -> None:\n",
" \"\"\"\n",
" Add custom metadata to processed documents.\n",
"\n",
" Args:\n",
" directory (str): Directory to read files from\n",
" \"\"\"\n",
" for filename in os.listdir(directory):\n",
" if filename.endswith(\".json\"):\n",
" file_path = os.path.join(directory, filename)\n",
@@ -449,7 +479,7 @@
},
"outputs": [],
"source": [
"add_custom_metadata_to_json_outputs(f\"{work_dir}/embed\")"
"add_custom_metadata_to_json_outputs(f\"{WORK_DIR}/embed\")"
]
},
{
@@ -461,7 +491,7 @@
"## Step 5: Write the results to MongoDB\n",
"\n",
"To write the results to MongoDB, we will need to rerun the same pipeline, except we'll now change the destination from local to MongoDB.\n",
"The pipeline will not repeat partitioning, chunking and embedding steps, since there are results for them already cached in the `work_dir`. It will pick up the customized embedding results and load them into a MongoDB collection.\n",
"The pipeline will not repeat partitioning, chunking and embedding steps, since there are results for them already cached in the `WORKING_DIR`. It will pick up the customized embedding results and load them into a MongoDB collection.\n",
"\n"
]
},
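Once the second run finishes, a quick `pymongo` check can confirm the documents landed in the collection. The URI, database, and collection names below are placeholders, and the embedding field name is an assumption.

```python
from pymongo import MongoClient

# Placeholder connection details; substitute your own values.
client = MongoClient("<MONGODB_URI>")
collection = client["<DB_NAME>"]["<COLLECTION_NAME>"]

print(collection.count_documents({}))  # total chunks loaded
# Peek at one document, excluding the (assumed) embedding field to keep output short
print(collection.find_one({}, {"embeddings": 0}))
```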
@@ -495,7 +525,7 @@
"source": [
"Pipeline.from_configs(\n",
" context=ProcessorConfig(\n",
" verbose=True, tqdm=True, num_processes=5, work_dir=work_dir\n",
" verbose=True, tqdm=True, num_processes=5, work_dir=WORK_DIR\n",
" ),\n",
" indexer_config=S3IndexerConfig(remote_url=AWS_S3_NAME),\n",
" downloader_config=S3DownloaderConfig(),\n",
