Skip to content

Commit

Permalink
improve doc parsing (langchain-ai#161)
Browse files Browse the repository at this point in the history
  • Loading branch information
baskaryan authored Sep 26, 2023
1 parent b7ec169 commit 0a5bbd2
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 4 deletions.
9 changes: 6 additions & 3 deletions ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
WEAVIATE_API_KEY = os.environ["WEAVIATE_API_KEY"]
RECORD_MANAGER_DB_URL = os.environ["RECORD_MANAGER_DB_URL"]


def metadata_extractor(meta: dict, soup: BeautifulSoup) -> dict:
title = soup.find("title")
description = soup.find("meta", attrs={"name": "description"})
Expand All @@ -30,7 +31,7 @@ def metadata_extractor(meta: dict, soup: BeautifulSoup) -> dict:
"title": title.get_text() if title else "",
"description": description.get("content", "") if description else "",
"language": html.get("lang", "") if html else "",
- **meta
+ **meta,
}


Expand All @@ -41,15 +42,17 @@ def load_langchain_docs():
parsing_function=langchain_docs_extractor,
default_parser="lxml",
bs_kwargs={
- "parse_only": SoupStrainer(name="article"),
+ "parse_only": SoupStrainer(
+ name=("article", "title", "html", "lang", "content")
+ ),
},
meta_function=metadata_extractor,
).load()


def simple_extractor(html: str) -> str:
soup = BeautifulSoup(html, "lxml")
- return re.sub(r"\n\n+", "\n\n", soup.text)
+ return re.sub(r"\n\n+", "\n\n", soup.text).strip()


def load_api_docs():
Expand Down
3 changes: 2 additions & 1 deletion parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,4 +106,5 @@ def get_text(tag: Tag) -> Generator[str, None, None]:
else:
yield from get_text(child)

- return "".join(get_text(soup))
+ joined = "".join(get_text(soup))
+ return re.sub(r"\n\n+", "\n\n", joined).strip()

0 comments on commit 0a5bbd2

Please sign in to comment.