diff --git a/.github/workflows/jmh-bechmarks.yml b/.github/workflows/jmh-bechmarks.yml
new file mode 100644
index 000000000000..a20b90dac1e8
--- /dev/null
+++ b/.github/workflows/jmh-bechmarks.yml
@@ -0,0 +1,115 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# Manually triggered workflow: runs the requested JMH benchmarks against the
+# given repository and branch, then uploads the result files as an artifact.
+name: "JMH Benchmarks"
+on:
+  workflow_dispatch:
+    inputs:
+      repo:
+        description: 'Repo name with owner, such as apache/iceberg'
+        required: true
+      ref:
+        description: 'The branch name'
+        required: true
+      benchmarks:
+        description: 'A list of comma-separated double-quoted Benchmark names, such as "IcebergSourceFlatParquetDataReadBenchmark", "IcebergSourceFlatParquetDataFilterBenchmark"'
+        required: true
+
+jobs:
+  # Converts the "benchmarks" input into the JSON array that drives the job matrix.
+  matrix:
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+      foundlabel: ${{ steps.set-matrix.outputs.foundlabel }}
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          repository: ${{ github.event.inputs.repo }}
+          ref: ${{ github.event.inputs.ref }}
+      - id: set-matrix
+        # The input is handed over through the environment rather than pasted into
+        # the script text, so quotes in it cannot alter the shell command.
+        env:
+          BENCHMARKS: ${{ github.event.inputs.benchmarks }}
+        run: |
+          # Keep only names ending in "Benchmark"; -c prints a one-line JSON array.
+          matrix=$(printf '[%s]' "$BENCHMARKS" | jq -c '[.[] | select(endswith("Benchmark"))]')
+          echo "matrix=$matrix" >> "$GITHUB_OUTPUT"
+          echo "foundlabel=$(echo "$matrix" | jq 'length > 0')" >> "$GITHUB_OUTPUT"
+
+  # Logs the resolved inputs so a run can be identified from its output.
+  show-matrix:
+    needs: matrix
+    runs-on: ubuntu-latest
+    steps:
+      - env:
+          REPO: ${{ github.event.inputs.repo }}
+          REF: ${{ github.event.inputs.ref }}
+          BENCHMARKS: ${{ needs.matrix.outputs.matrix }}
+          FOUND: ${{ needs.matrix.outputs.foundlabel }}
+        run: |
+          echo "Repo: $REPO"
+          echo "Ref: $REF"
+          echo "Benchmarks: $BENCHMARKS"
+          echo "Found Benchmarks? $FOUND"
+
+  # One job per selected benchmark; skipped when no valid name was supplied.
+  run-benchmark:
+    if: ${{ needs.matrix.outputs.foundlabel == 'true' }}
+    needs: matrix
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        benchmark: ${{ fromJson(needs.matrix.outputs.matrix) }}
+    env:
+      SPARK_LOCAL_IP: localhost
+    steps:
+      # Check out the requested repository and branch, not the repository the
+      # workflow was started from.
+      - uses: actions/checkout@v2
+        with:
+          repository: ${{ github.event.inputs.repo }}
+          ref: ${{ github.event.inputs.ref }}
+      - uses: actions/setup-java@v1
+        with:
+          java-version: 11
+      - uses: actions/cache@v2
+        with:
+          path: ~/.gradle/caches
+          key: ${{ runner.os }}-gradle-${{ hashFiles('**/*.gradle') }}
+          restore-keys: ${{ runner.os }}-gradle
+      # Append "<eth0 IPv4 address> <FQDN> <short hostname>" to /etc/hosts.
+      - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts
+
+      - name: Run Benchmark
+        env:
+          BENCHMARK: ${{ matrix.benchmark }}
+        run: ./gradlew :iceberg-spark:iceberg-spark3:jmh -PjmhIncludeRegex="$BENCHMARK" -PjmhOutputPath="benchmark/$BENCHMARK.txt"
+
+      - uses: actions/upload-artifact@v2
+        # always(): upload whatever results exist even if the benchmark step failed.
+        if: ${{ always() }}
+        with:
+          name: benchmark-results
+          path: |
+            **/benchmark/*.txt
diff --git a/site/docs/benchmarks.md b/site/docs/benchmarks.md
index 02721c4396ea..7d87dbf6179a 100644
--- a/site/docs/benchmarks.md
+++ b/site/docs/benchmarks.md
@@ -22,6 +22,23 @@
 Benchmarks are located under `/jmh`. It is generally favorable to only run the tests of interest rather than running all available benchmarks.
 Also note that JMH benchmarks run within the same JVM as the system-under-test, so results might vary between runs.
 
+## Running Benchmarks on GitHub
+
+It is possible to run one or more Benchmarks via the **JMH Benchmarks** GH action on your own fork of the Iceberg repo. 
This GH action takes the following inputs:
+* The name of the repository to run the benchmarks against, such as `apache/iceberg` or `<user>/iceberg`
+* The branch name to run benchmarks against, such as `master` or `my-cool-feature-branch`
+* A list of comma-separated double-quoted Benchmark names, such as `"IcebergSourceFlatParquetDataReadBenchmark", "IcebergSourceFlatParquetDataFilterBenchmark", "IcebergSourceNestedListParquetDataWriteBenchmark"`
+
+Benchmark results will be uploaded once **all** benchmarks are done.
+
+It is worth noting that the GH runners have limited resources, so the benchmark results should be seen only as an indicator that helps developers understand the impact of code changes.
+Results are likely to vary between runs, so they shouldn't be used to form assumptions around production choices.
+
+
+## Running Benchmarks locally
+
+Below are the existing benchmarks shown with the actual commands on how to run them locally.
+
 ### IcebergSourceNestedListParquetDataWriteBenchmark
 A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and the built-in file source in Spark.
 To run this benchmark for either spark-2 or spark-3: