Merge pull request #102 from Azure-Samples/installable
More generic, installable
pamelafox authored Oct 25, 2024
2 parents b0697bf + 66b317e commit 7441f26
Showing 46 changed files with 494 additions and 318 deletions.
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
@@ -19,5 +19,5 @@
"hostRequirements": {
"memory": "8gb"
},
"postCreateCommand": "pip install -r requirements-dev.txt"
"postCreateCommand": "pip install -e .\"[dev]\""
}
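In the devcontainer JSON the inner quotes are escaped; on a plain shell the same install is usually written with the extras specifier quoted, since some shells (zsh in particular) treat unquoted square brackets as glob patterns. A small sketch of equivalent forms:

```shell
# Equivalent ways to install the package with its dev extras (sketch):
pip install -e ".[dev]"
pip install -e '.[dev]'
```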
2 changes: 1 addition & 1 deletion .github/CONTRIBUTING.md
@@ -61,7 +61,7 @@ Before you submit your Pull Request (PR) consider the following guidelines:
* Install the development tools and pre-commit hooks:

```shell
python3 -m pip install -r requirements-dev.txt
python3 -m pip install -e ".[dev]"
pre-commit install
```

4 changes: 2 additions & 2 deletions .github/workflows/azure-dev.yaml
@@ -92,12 +92,12 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -e .[dev]
- name: Run evaluation
run: |
azd env get-values > .env
source .env
python -m scripts evaluate --config=example_config.json --numquestions=2 --targeturl=${{ env.TARGET_URL }}
python -m evaltools evaluate --config=example_config.json --numquestions=2 --targeturl=${{ env.TARGET_URL }}
env:
TARGET_URL: ${{ secrets.TARGET_URL }}
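The same evaluation step can be reproduced locally. A sketch mirroring the workflow above; `TARGET_URL` is a placeholder for your deployed chat endpoint and is not defined anywhere in this diff:

```shell
# Export the azd environment and run a 2-question evaluation against a deployed endpoint (sketch)
azd env get-values > .env
source .env
python -m evaltools evaluate --config=example_config.json --numquestions=2 --targeturl="$TARGET_URL"
```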
8 changes: 4 additions & 4 deletions .github/workflows/python.yaml
@@ -39,10 +39,10 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements-dev.txt
pip install -e .[dev]
- name: Lint with ruff
run: ruff check .
- name: Check formatting with black
run: black . --check --verbose
- name: Check formatting with ruff
run: ruff format . --check
- name: Run Pytest tests
run: python3 -m pytest
run: python -m pytest
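The same checks can be run locally before pushing; a sketch mirroring the workflow steps above, assuming the dev extras are installed:

```shell
pip install -e ".[dev]"   # dev extras bring in ruff and pytest
ruff check .              # lint
ruff format . --check     # report formatting issues; drop --check to rewrite files
python -m pytest          # run the test suite
```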
12 changes: 6 additions & 6 deletions .pre-commit-config.yaml
@@ -6,10 +6,10 @@ repos:
- id: end-of-file-fixer
- id: trailing-whitespace
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.2.0
rev: v0.6.2
hooks:
- id: ruff
- repo: https://github.com/psf/black
rev: 24.1.1
hooks:
- id: black
# Run the linter.
- id: ruff
args: [ --fix ]
# Run the formatter.
- id: ruff-format
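To exercise these hooks outside of a commit, the standard pre-commit invocations apply (a sketch, assuming the hooks were installed with `pre-commit install`):

```shell
# Run every configured hook against the whole repository
pre-commit run --all-files

# Or run only the ruff hooks defined above
pre-commit run ruff --all-files
pre-commit run ruff-format --all-files
```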
4 changes: 1 addition & 3 deletions .vscode/settings.json
@@ -1,7 +1,5 @@
{
"python.testing.pytestArgs": [
"scripts"
],
"python.testing.pytestArgs": ["tests"],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"files.exclude": {
34 changes: 20 additions & 14 deletions README.md
@@ -38,10 +38,10 @@ If not, then follow these steps:

1. Install Python 3.10 or higher
2. Create a Python [virtual environment](https://learn.microsoft.com/azure/developer/python/get-started?tabs=cmd#configure-python-virtual-environment).
3. Inside that virtual environment, install the requirements:
3. Inside that virtual environment, install the project:

```shell
python -m pip install -r requirements.txt
python -m pip install -e .
```
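A quick way to confirm the editable install worked is to ask the CLI for help. This is a sketch: it assumes the `evaltools` Typer app exposes the usual `--help`, which should list subcommands such as `generate`, `evaluate`, `summary`, and `diff`:

```shell
python -m pip install -e .
python -m evaltools --help   # expected to print the available subcommands
```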

## Deploying a GPT-4 model
@@ -57,11 +57,17 @@ We've made that easy to deploy with the `azd` CLI tool.
1. Install the [Azure Developer CLI](https://aka.ms/azure-dev/install)
2. Run `azd auth login` to log in to your Azure account
3. Run `azd up` to deploy a new GPT-4 instance
4. Create a `.env` file based on the provisioned resources by copying `.env.sample` and filling in the required values.
You can run this command to see the deployed values:
4. Create a `.env` file based on `.env.sample`:
```shell
azd env get-values
cp .env.sample .env
```
5. Run these commands to get the required values for `AZURE_OPENAI_EVAL_DEPLOYMENT` and `AZURE_OPENAI_SERVICE` from your deployed resource group, and paste those values into the `.env` file:
```shell
azd env get-value AZURE_OPENAI_EVAL_DEPLOYMENT
azd env get-value AZURE_OPENAI_SERVICE
```
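If you prefer not to paste the values by hand, the same two settings can be appended to `.env` directly. A sketch, assuming the `azd` environment created by `azd up` is still selected:

```shell
echo "AZURE_OPENAI_EVAL_DEPLOYMENT=$(azd env get-value AZURE_OPENAI_EVAL_DEPLOYMENT)" >> .env
echo "AZURE_OPENAI_SERVICE=$(azd env get-value AZURE_OPENAI_SERVICE)" >> .env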
### Using an existing Azure OpenAI instance
@@ -133,7 +139,7 @@ This repo includes a script for generating questions and answers from documents
3. Run the generator script:

```shell
python -m scripts generate --output=example_input/qa.jsonl --numquestions=200 --persource=5
python -m evaltools generate --output=example_input/qa.jsonl --persource=5 --numquestions=200
```

That script will generate 200 questions and answers, and store them in `example_input/qa.jsonl`. We've already provided an example based off the sample documents for this app.
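Because the output is JSON Lines, a quick spot check of the generated file only needs standard shell tools (sketch):

```shell
wc -l example_input/qa.jsonl      # one generated question/answer pair per line
head -n 1 example_input/qa.jsonl  # inspect the first record
```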
@@ -145,15 +151,15 @@ This repo includes a script for generating questions and answers from documents
By default, this script assumes your index citation field is named `sourcepage`. If your search index uses a different citation field name, use the `citationfieldname` option to specify the correct name:
```shell
python -m scripts generate --output=example_input/qa.jsonl --numquestions=200 --persource=5 --citationfieldname=filepath
python -m evaltools generate --output=example_input/qa.jsonl --persource=5 --numquestions=200 --citationfieldname=filepath
```
## Running an evaluation
We provide a script that loads in the current `azd` environment's variables, installs the requirements for the evaluation, and runs the evaluation against the local app. Run it like this:

```shell
python -m scripts evaluate --config=example_config.json
python -m evaltools evaluate --config=example_config.json
```

The config.json should contain these fields as a minimum:
@@ -184,7 +190,7 @@ To run against a deployed endpoint, change the `target_url` to the chat endpoint
It's common to run the evaluation on a subset of the questions, to get a quick sense of how the changes are affecting the answers. To do this, use the `--numquestions` parameter:

```shell
python -m scripts evaluate --config=example_config.json --numquestions=2
python -m evaltools evaluate --config=example_config.json --numquestions=2
```

### Specifying the evaluation metrics
@@ -280,7 +286,7 @@ located inside the `review-tools` folder.
To view a summary across all the runs, use the `summary` command with the path to the results folder:

```bash
python -m review_tools summary example_results
python -m evaltools summary example_results
```

This will display an interactive table with the results for each run, like this:
@@ -295,7 +301,7 @@ A modal will appear with the parameters, including any prompt override.
To compare the answers generated for each question across 2 runs, use the `compare` command with 2 paths:

```bash
python -m review_tools diff example_results/baseline_1 example_results/baseline_2
python -m evaltools diff example_results/baseline_1 example_results/baseline_2
```

This will display each question, one at a time, with the two generated answers in scrollable panes,
@@ -308,7 +314,7 @@ Use the buttons at the bottom to navigate to the next question or quit the tool.
You can also filter to only show questions where the value changed for a particular metric, like this:

```bash
python -m review_tools diff example_results/baseline_1 example_results/baseline_2 --changed=has_citation
python -m evaltools diff example_results/baseline_1 example_results/baseline_2 --changed=has_citation
```

## Measuring app's ability to say "I don't know"
@@ -329,7 +335,7 @@ You can write these questions manually, but it’s also possible to generate them
assuming you already have ground truth data with answerable questions.

```shell
python -m scripts generate-dontknows --input=example_input/qa.jsonl --output=example_input/qa_dontknows.jsonl --numquestions=45
python -m evaltools generate-dontknows --input=example_input/qa.jsonl --output=example_input/qa_dontknows.jsonl --numquestions=45
```

That script sends the current questions to the configured GPT-4 model along with prompts to generate questions of each kind.
@@ -360,7 +366,7 @@ We recommend a separate output folder, as you'll likely want to make multiple runs
Run the evaluation like this:

```shell
python -m scripts evaluate --config=dontknows.config.json
python -m evaltools evaluate --config=dontknows.config.json
```

The results will be stored in the `results_dir` folder, and can be reviewed using the [review tools](#viewing-the-results).
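The review tools shown earlier work on this output as well; for example, a summary over the don't-know runs might look like this (a sketch, assuming results land under `example_results_dontknows/` as configured in `dontknows.config.json` below):

```shell
python -m evaltools summary example_results_dontknows
```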
24 changes: 20 additions & 4 deletions dontknows.config.json
@@ -1,11 +1,27 @@
{
"testdata_path": "example_input/qa_dontknows.jsonl",
"results_dir": "blog_results_dontknows/gpt35_prompt2",
"results_dir": "example_results_dontknows/baseline",
"requested_metrics": ["dontknowness", "answer_length", "latency", "has_citation"],
"target_url": "http://host.docker.internal:50505/chat",
"target_url": "http://localhost:50505/chat",
"target_parameters": {
"overrides": {
"prompt_template": "<READFILE>example_input/prompt_refined.txt"
"top": 3,
"temperature": 0.3,
"minimum_reranker_score": 0,
"minimum_search_score": 0,
"retrieval_mode": "hybrid",
"semantic_ranker": true,
"semantic_captions": false,
"suggest_followup_questions": false,
"use_oid_security_filter": false,
"use_groups_security_filter": false,
"vector_fields": [
"embedding"
],
"use_gpt4v": false,
"gpt4v_input": "textAndImages"
}
}
},
"target_response_answer_jmespath": "message.content",
"target_response_context_jmespath": "context.data_points.text"
}
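After editing overrides like these, a quick validity check of the config file is cheap. A sketch using only the Python standard library's JSON pretty-printer:

```shell
python -m json.tool dontknows.config.json > /dev/null && echo "valid JSON"
```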
30 changes: 30 additions & 0 deletions pyproject.toml
@@ -1,6 +1,7 @@
[tool.ruff]
line-length = 120
target-version = "py39"
lint.isort.known-first-party = ["evaltools"]
lint.select = ["E", "F", "I", "UP"]
lint.ignore = ["D203"]

@@ -10,3 +11,32 @@ target-version = ["py39"]

[tool.pytest.ini_options]
addopts = "-ra"

[project]
name = "evaltools"
version = "0.1.1"
description = "Evaluate chat applications using Azure OpenAI evaluators"
dependencies = [
"requests",
"python-dotenv",
"azure-ai-generative[evaluate]==1.0.0b8",
"azure-ai-evaluation==1.0.0b3",
"azure-search-documents",
"typer",
"openai>=1.0.0",
"pandas",
"rich",
"jmespath",
"textual"
]

[project.optional-dependencies]
dev = [
"pre-commit",
"ruff",
"black",
"pytest"
]

[tool.setuptools.package-data]
evaltools = ["review/*.tcss"]
5 changes: 0 additions & 5 deletions requirements-dev.txt

This file was deleted.

2 changes: 0 additions & 2 deletions requirements.txt

This file was deleted.

6 changes: 0 additions & 6 deletions review_tools/__main__.py

This file was deleted.

31 changes: 0 additions & 31 deletions review_tools/cli.py

This file was deleted.
