Skip to content

Commit

Permalink
feat/remove ingest code, use new dep for tests (Unstructured-IO#3595)
Browse files Browse the repository at this point in the history
### Description
Alternative to Unstructured-IO#3572 that keeps all of the ingest tests,
running them against the latest version of unstructured-ingest, which is
now pulled in as a dependency.

---------

Co-authored-by: ryannikolaidis <[email protected]>
Co-authored-by: rbiseck3 <[email protected]>
Co-authored-by: Christine Straub <[email protected]>
Co-authored-by: christinestraub <[email protected]>
  • Loading branch information
5 people authored Oct 15, 2024
1 parent ecf0267 commit 9049e4e
Show file tree
Hide file tree
Showing 608 changed files with 943 additions and 42,409 deletions.
2 changes: 1 addition & 1 deletion .github/actions/base-ingest-cache/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ runs:
python -m pip install --upgrade setuptools
fi
make install-ci
make install-all-ingest
make install-ingest
- name: Save Ingest Cache
if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true'
id: ingest-virtualenv-cache-save
Expand Down
131 changes: 0 additions & 131 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ jobs:
- name: Install all doc and test dependencies
run: |
make install-ci
make install-all-ingest
make check-licenses
lint:
Expand Down Expand Up @@ -273,37 +272,6 @@ jobs:
python-version: ${{ matrix.python-version }}
check-only: 'true'

test_ingest_unit:
strategy:
matrix:
python-version: [ "3.9","3.10" ]
runs-on: ubuntu-latest
needs: [ setup_ingest, lint ]
steps:
# actions/checkout MUST come before auth
- uses: 'actions/checkout@v4'
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Get full Python version
id: full-python-version
run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT
- name: Setup virtual environment
uses: ./.github/actions/base-ingest-cache
with:
python-version: ${{ matrix.python-version }}
- name: Test Ingest (unit)
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
PYTHON: python${{ matrix.python-version }}
run: |
source .venv/bin/activate
make install-ci
make install-all-ingest
PYTHONPATH=. ${PYTHON} -m pytest test_unstructured_ingest/unit
test_ingest_src:
strategy:
matrix:
Expand Down Expand Up @@ -378,8 +346,6 @@ jobs:
PYTHON: python${{ matrix.python-version }}
run: |
source .venv/bin/activate
make install-ci
make install-all-ingest
sudo apt-get update
sudo apt-get install -y libmagic-dev poppler-utils libreoffice
make install-pandoc
Expand All @@ -392,103 +358,6 @@ jobs:
./test_unstructured_ingest/test-ingest-src.sh
test_ingest_dest:
environment: ci
strategy:
matrix:
python-version: ["3.9","3.10"]
runs-on: ubuntu-latest-m
needs: [setup_ingest, lint]
steps:
# actions/checkout MUST come before auth
- uses: 'actions/checkout@v4'
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Get full Python version
id: full-python-version
run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT
- name: Setup virtual environment
uses: ./.github/actions/base-ingest-cache
with:
python-version: ${{ matrix.python-version }}
- name: Setup docker-compose
uses: KengoTODA/actions-setup-docker-compose@v1
with:
version: '2.22.0'
- name: Test (end-to-end)
env:
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
S3_INGEST_TEST_ACCESS_KEY: ${{ secrets.S3_INGEST_TEST_ACCESS_KEY }}
S3_INGEST_TEST_SECRET_KEY: ${{ secrets.S3_INGEST_TEST_SECRET_KEY }}
AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }}
AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }}
BOX_APP_CONFIG: ${{ secrets.BOX_APP_CONFIG }}
DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }}
DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }}
DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }}
GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
MONGODB_URI: ${{ secrets.MONGODB_URI }}
MONGODB_DATABASE_NAME: ${{ secrets.MONGODB_DATABASE_NAME }}
AZURE_DEST_CONNECTION_STR: ${{ secrets.AZURE_DEST_CONNECTION_STR }}
PINECONE_API_KEY: ${{secrets.PINECONE_API_KEY}}
VECTARA_OAUTH_CLIENT_ID: ${{secrets.VECTARA_OAUTH_CLIENT_ID}}
VECTARA_OAUTH_SECRET: ${{secrets.VECTARA_OAUTH_SECRET}}
VECTARA_CUSTOMER_ID: ${{secrets.VECTARA_CUSTOMER_ID}}
ASTRA_DB_APPLICATION_TOKEN: ${{secrets.ASTRA_DB_TOKEN}}
ASTRA_DB_API_ENDPOINT: ${{secrets.ASTRA_DB_ENDPOINT}}
CLARIFAI_API_KEY: ${{secrets.CLARIFAI_API_KEY}}
DATABRICKS_HOST: ${{secrets.DATABRICKS_HOST}}
DATABRICKS_USERNAME: ${{secrets.DATABRICKS_USERNAME}}
DATABRICKS_PASSWORD: ${{secrets.DATABRICKS_PASSWORD}}
DATABRICKS_CATALOG: ${{secrets.DATABRICKS_CATALOG}}
OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
CI: "true"
NLTK_DATA: ${{ github.workspace }}/nltk_data
PYTHON: python${{ matrix.python-version }}
run: |
source .venv/bin/activate
make install-ci
make install-all-ingest
sudo apt-get update
sudo apt-get install -y libmagic-dev poppler-utils libreoffice
make install-pandoc
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
sudo apt-get update
sudo apt-get install -y tesseract-ocr
sudo apt-get install -y tesseract-ocr-kor
sudo apt-get install diffstat
tesseract --version
./test_unstructured_ingest/test-ingest-dest.sh
test_ingest_help:
environment: ci
strategy:
matrix:
python-version: ["3.9","3.10","3.11", "3.12"]
runs-on: ubuntu-latest
needs: [setup_ingest, lint]
steps:
- uses: 'actions/checkout@v4'
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Setup virtual environment
uses: ./.github/actions/base-ingest-cache
with:
python-version: ${{ matrix.python-version }}
- name: Validate --help
run: |
source .venv/bin/activate
make install-ci
make install-all-ingest
./test_unstructured_ingest/test-help.sh
test_unstructured_api_unit:
strategy:
matrix:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/ingest-test-fixtures-update-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ jobs:
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
sudo apt-get install -y tesseract-ocr
sudo apt-get install -y tesseract-ocr-kor
sudo apt-get install diffstat
tesseract --version
./test_unstructured_ingest/test-ingest-src.sh
Expand Down
7 changes: 6 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
## 0.15.15-dev0
## 0.16.0

### Enhancements

* **Remove ingest implementation.** The deprecated ingest functionality has been removed, as it is now maintained in the separate [unstructured-ingest](https://github.com/Unstructured-IO/unstructured-ingest) repository.
* Replace the extras in the `requirements/ingest` directory with a new `ingest.txt` extra for installing the `unstructured-ingest` library.
* Remove the `unstructured.ingest` submodule.
* Delete all shell scripts previously used for destination ingest tests.

### Features

### Fixes
Expand Down
42 changes: 0 additions & 42 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -15,45 +15,3 @@ include requirements/extra-pptx.in
include requirements/extra-xlsx.in
include requirements/huggingface.in

# Ingest extras
include requirements/ingest/airtable.in
include requirements/ingest/astradb.in
include requirements/ingest/azure-cognitive-search.in
include requirements/ingest/azure.in
include requirements/ingest/biomed.in
include requirements/ingest/box.in
include requirements/ingest/chroma.in
include requirements/ingest/confluence.in
include requirements/ingest/databricks-volumes.in
include requirements/ingest/delta-table.in
include requirements/ingest/discord.in
include requirements/ingest/dropbox.in
include requirements/ingest/elasticsearch.in
include requirements/ingest/embed-aws-bedrock.in
include requirements/ingest/embed-huggingface.in
include requirements/ingest/embed-mixedbreadai.in
include requirements/ingest/embed-openai.in
include requirements/ingest/gcs.in
include requirements/ingest/github.in
include requirements/ingest/gitlab.in
include requirements/ingest/google-drive.in
include requirements/ingest/hubspot.in
include requirements/ingest/jira.in
include requirements/ingest/kafka.in
include requirements/ingest/mongodb.in
include requirements/ingest/notion.in
include requirements/ingest/onedrive.in
include requirements/ingest/opensearch.in
include requirements/ingest/outlook.in
include requirements/ingest/pinecone.in
include requirements/ingest/postgres.in
include requirements/ingest/qdrant.in
include requirements/ingest/reddit.in
include requirements/ingest/s3.in
include requirements/ingest/salesforce.in
include requirements/ingest/sftp.in
include requirements/ingest/sharepoint.in
include requirements/ingest/slack.in
include requirements/ingest/singlestore.in
include requirements/ingest/weaviate.in
include requirements/ingest/wikipedia.in
Loading

0 comments on commit 9049e4e

Please sign in to comment.