improve doc parsing (langchain-ai#161)

arianpasquali · Sep 26, 2023 · 0a5bbd2
1 parent b7ec169
commit 0a5bbd2
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 4 deletions.
diff --git a/ingest.py b/ingest.py
@@ -21,6 +21,7 @@
 WEAVIATE_API_KEY = os.environ["WEAVIATE_API_KEY"]
 RECORD_MANAGER_DB_URL = os.environ["RECORD_MANAGER_DB_URL"]
 
+
 def metadata_extractor(meta: dict, soup: BeautifulSoup) -> dict:
     title = soup.find("title")
     description = soup.find("meta", attrs={"name": "description"})
@@ -30,7 +31,7 @@ def metadata_extractor(meta: dict, soup: BeautifulSoup) -> dict:
         "title": title.get_text() if title else "",
         "description": description.get("content", "") if description else "",
         "language": html.get("lang", "") if html else "",
-        **meta
+        **meta,
     }
 
 
@@ -41,15 +42,17 @@ def load_langchain_docs():
         parsing_function=langchain_docs_extractor,
         default_parser="lxml",
         bs_kwargs={
-            "parse_only": SoupStrainer(name="article"),
+            "parse_only": SoupStrainer(
+                name=("article", "title", "html", "lang", "content")
+            ),
         },
         meta_function=metadata_extractor,
     ).load()
 
 
 def simple_extractor(html: str) -> str:
     soup = BeautifulSoup(html, "lxml")
-    return re.sub(r"\n\n+", "\n\n", soup.text)
+    return re.sub(r"\n\n+", "\n\n", soup.text).strip()
 
 
 def load_api_docs():

diff --git a/parser.py b/parser.py
@@ -106,4 +106,5 @@ def get_text(tag: Tag) -> Generator[str, None, None]:
                 else:
                     yield from get_text(child)
 
-    return "".join(get_text(soup))
+    joined = "".join(get_text(soup))
+    return re.sub(r"\n\n+", "\n\n", joined).strip()