Skip to content

Commit

Permalink
Add support to unstructrued (microsoft#501)
Browse files Browse the repository at this point in the history
* Add support to unstructrued

* Fix tests

* Add test and documents

* Fix tests

* Fix tests

* Test unstructured on linux and mac
  • Loading branch information
thinkall authored Nov 5, 2023
1 parent 0dd0fc5 commit f052977
Show file tree
Hide file tree
Showing 6 changed files with 83 additions and 11 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ jobs:
python -c "import autogen"
pip install -e. pytest
pip uninstall -y openai
- name: Install unstructured if not windows
if: matrix.os != 'windows-2019'
run: |
pip install "unstructured[all-docs]"
- name: Test with pytest
if: matrix.python-version != '3.10'
run: |
Expand Down
16 changes: 15 additions & 1 deletion autogen/retrieve_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@
import pypdf
from autogen.token_count_utils import count_token

try:
from unstructured.partition.auto import partition

HAS_UNSTRUCTURED = True
except ImportError:
HAS_UNSTRUCTURED = False

logger = logging.getLogger(__name__)
TEXT_FORMATS = [
"txt",
Expand All @@ -33,6 +40,10 @@
"yml",
"pdf",
]
UNSTRUCTURED_FORMATS = ["docx", "doc", "odt", "pptx", "ppt", "xlsx", "eml", "msg", "epub"]
if HAS_UNSTRUCTURED:
TEXT_FORMATS += UNSTRUCTURED_FORMATS
TEXT_FORMATS = list(set(TEXT_FORMATS))
VALID_CHUNK_MODES = frozenset({"one_line", "multi_lines"})


Expand Down Expand Up @@ -123,7 +134,10 @@ def split_files_to_chunks(
_, file_extension = os.path.splitext(file)
file_extension = file_extension.lower()

if file_extension == ".pdf":
if HAS_UNSTRUCTURED and file_extension[1:] in UNSTRUCTURED_FORMATS:
text = partition(file)
text = "\n".join([t.text for t in text]) if len(text) > 0 else ""
elif file_extension == ".pdf":
text = extract_text_from_pdf(file)
else: # For non-PDF text-based files
with open(file, "r", encoding="utf-8", errors="ignore") as f:
Expand Down
Binary file added test/test_files/example.docx
Binary file not shown.
35 changes: 31 additions & 4 deletions test/test_retrieve_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,15 @@
else:
skip = False
import os
import sys
import pytest

try:
from unstructured.partition.auto import partition

HAS_UNSTRUCTURED = True
except ImportError:
HAS_UNSTRUCTURED = False

test_dir = os.path.join(os.path.dirname(__file__), "test_files")
expected_text = """AutoGen is an advanced tool designed to assist developers in harnessing the capabilities
Expand Down Expand Up @@ -47,7 +54,10 @@ def test_split_files_to_chunks(self):
pdf_file_path = os.path.join(test_dir, "example.pdf")
txt_file_path = os.path.join(test_dir, "example.txt")
chunks = split_files_to_chunks([pdf_file_path, txt_file_path])
assert all(isinstance(chunk, str) and chunk.strip() for chunk in chunks)
assert all(
isinstance(chunk, str) and "AutoGen is an advanced tool designed to assist developers" in chunk.strip()
for chunk in chunks
)

def test_get_files_from_dir(self):
files = get_files_from_dir(test_dir)
Expand Down Expand Up @@ -161,14 +171,17 @@ def custom_text_split_function(text):
)
results = query_vector_db(["autogen"], client=client, collection_name="mytestcollection", n_results=1)
assert (
results.get("documents")[0][0]
== "AutoGen is an advanced tool designed to assist developers in harnessing the capabilities\nof Large Language Models (LLMs) for various applications. The primary purpose o"
"AutoGen is an advanced tool designed to assist developers in harnessing the capabilities"
in results.get("documents")[0][0]
)

def test_retrieve_utils(self):
client = chromadb.PersistentClient(path="/tmp/chromadb")
create_vector_db_from_dir(
dir_path="./website/docs", client=client, collection_name="autogen-docs", get_or_create=True
dir_path="./website/docs",
client=client,
collection_name="autogen-docs",
get_or_create=True,
)
results = query_vector_db(
query_texts=[
Expand All @@ -182,6 +195,20 @@ def test_retrieve_utils(self):
print(results["ids"][0])
assert len(results["ids"][0]) == 4

@pytest.mark.skipif(
not HAS_UNSTRUCTURED,
reason="do not run if unstructured is not installed",
)
def test_unstructured(self):
pdf_file_path = os.path.join(test_dir, "example.pdf")
txt_file_path = os.path.join(test_dir, "example.txt")
word_file_path = os.path.join(test_dir, "example.docx")
chunks = split_files_to_chunks([pdf_file_path, txt_file_path, word_file_path])
assert all(
isinstance(chunk, str) and "AutoGen is an advanced tool designed to assist developers" in chunk.strip()
for chunk in chunks
)


if __name__ == "__main__":
pytest.main()
Expand Down
10 changes: 10 additions & 0 deletions website/blog/2023-10-18-RetrieveChat/index.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,15 @@ Please install pyautogen with the [retrievechat] option before using RAG agents.
pip install "pyautogen[retrievechat]"
```

RetrieveChat can handle various types of documents. By default, it can process
plain text and PDF files, including formats such as 'txt', 'json', 'csv', 'tsv',
'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml' and 'pdf'.
If you install [unstructured](https://unstructured-io.github.io/unstructured/installation/full_installation.html)
(`pip install "unstructured[all-docs]"`), additional document types such as 'docx',
'doc', 'odt', 'pptx', 'ppt', 'xlsx', 'eml', 'msg', 'epub' will also be supported.

You can find a list of all supported document types by using `autogen.retrieve_utils.TEXT_FORMATS`.

1. Import Agents
```python
import autogen
Expand Down Expand Up @@ -474,3 +483,4 @@ The online app and the source code are hosted in [HuggingFace](https://huggingfa
You can check out more example notebooks for RAG use cases:
- [Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb)
- [Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb)
- [Automated Code Generation and Question Answering with Qdrant based Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_qdrant_RetrieveChat.ipynb)
29 changes: 23 additions & 6 deletions website/docs/Installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ Inference parameter tuning can be done via [`flaml.tune`](https://microsoft.gith
- `use_cache` is removed as a kwarg in `OpenAIWrapper.create()` for being automatically decided by `seed`: int | None.

### Optional Dependencies
* docker
- #### docker

For the best user experience and seamless code execution, we highly recommend using Docker with AutoGen. Docker is a containerization platform that simplifies the setup and execution of your code. Developing in a docker container, such as GitHub Codespace, also makes the development convenient.

Expand All @@ -77,34 +77,51 @@ When running AutoGen out of a docker container, to use docker for code execution
pip install docker
```

* blendsearch
- #### blendsearch

`pyautogen<0.2` offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Please install with the [blendsearch] option to use it.
```bash
pip install "pyautogen[blendsearch]<0.2"
```

Example notebooks:
[Optimize for Code Generation](https://github.com/microsoft/autogen/blob/main/notebook/oai_completion.ipynb),

[Optimize for Code Generation](https://github.com/microsoft/autogen/blob/main/notebook/oai_completion.ipynb)

[Optimize for Math](https://github.com/microsoft/autogen/blob/main/notebook/oai_chatgpt_gpt4.ipynb)

* retrievechat
- #### retrievechat

`pyautogen<0.2` supports retrieval-augmented generation tasks such as question answering and code generation with RAG agents. Please install with the [retrievechat] option to use it.
```bash
pip install "pyautogen[retrievechat]<0.2"
```

RetrieveChat can handle various types of documents. By default, it can process
plain text and PDF files, including formats such as 'txt', 'json', 'csv', 'tsv',
'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml' and 'pdf'.
If you install [unstructured](https://unstructured-io.github.io/unstructured/installation/full_installation.html)
(`pip install "unstructured[all-docs]"`), additional document types such as 'docx',
'doc', 'odt', 'pptx', 'ppt', 'xlsx', 'eml', 'msg', 'epub' will also be supported.

You can find a list of all supported document types by using `autogen.retrieve_utils.TEXT_FORMATS`.

Example notebooks:
[Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb),

[Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb)

[Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb)

* mathchat
[Automated Code Generation and Question Answering with Qdrant based Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_qdrant_RetrieveChat.ipynb)


- #### mathchat

`pyautogen<0.2` offers an experimental agent for math problem solving. Please install with the [mathchat] option to use it.
```bash
pip install "pyautogen[mathchat]<0.2"
```

Example notebooks:

[Using MathChat to Solve Math Problems](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_MathChat.ipynb)

0 comments on commit f052977

Please sign in to comment.