From 053aa0da1540bbf0bb449e8f2a331c9bf55ad7a7 Mon Sep 17 00:00:00 2001
From: Keren Zhou
Date: Mon, 10 Jun 2024 22:51:06 -0400
Subject: [PATCH] [CI] Run the documentation workload on `a100-runner-set` (#4118)

---
 .github/workflows/documentation.yml          |  2 +-
 python/tutorials/09-persistent-fp8-matmul.py | 46 ++++++++++----------
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
index fa4b18645ba5..66fac582fbc4 100644
--- a/.github/workflows/documentation.yml
+++ b/.github/workflows/documentation.yml
@@ -8,7 +8,7 @@ permissions: read-all

 jobs:
   Build-Documentation:
-    runs-on: [self-hosted, A100]
+    runs-on: [a100-runner-set]
     timeout-minutes: 30

     steps:
diff --git a/python/tutorials/09-persistent-fp8-matmul.py b/python/tutorials/09-persistent-fp8-matmul.py
index e01428c4d259..0b24ce4f4204 100644
--- a/python/tutorials/09-persistent-fp8-matmul.py
+++ b/python/tutorials/09-persistent-fp8-matmul.py
@@ -2,13 +2,12 @@
 Persistent FP8 Matmul
 =====================
 This script demonstrates persistent kernel implementations of matrix multiplication using Triton.
-It includes various matmul methods, such as naive, persistent, and TMA (Tile Matrix Accumulation) based approaches, and only supports GPUs with compute capability >= 9.0.
+It includes various matmul methods, such as naive, persistent, and TMA (Tensor Memory Accelerator) based approaches, and only supports GPUs with compute capability >= 9.0.
 Triton and CuBLAS implementations are benchmarked under different configurations and evaluated using the proton profiler.
 Users can pass command-line arguments to specify matrix dimensions and iteration steps flexibly.
 """

 import argparse
-import sys
 import time

 import numpy as np
@@ -19,12 +18,9 @@
 from triton._C.libtriton import nvidia

-if not (torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 9):
-    print("This tutorial fp8_matmul is only supported on CUDA with cc >= 90")
-    sys.exit(0)
-
-cublas_workspace = torch.empty(32 * 1024 * 1024, device="cuda", dtype=torch.uint8)
-cublas = nvidia.cublas.CublasLt(cublas_workspace)
+if torch.cuda.is_available():
+    cublas_workspace = torch.empty(32 * 1024 * 1024, device="cuda", dtype=torch.uint8)
+    cublas = nvidia.cublas.CublasLt(cublas_workspace)


 def _matmul_launch_metadata(grid, kernel, args):
@@ -397,22 +393,26 @@ def validate(M, N, K):
     )


-parser = argparse.ArgumentParser()
-parser.add_argument("-K", type=int, required=False)
-parser.add_argument("--K_range", type=int, nargs=2)
-parser.add_argument("--K_step", type=int, default=512)
-args = parser.parse_args()
+if __name__ == "__main__":
+    if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 9:
+        parser = argparse.ArgumentParser()
+        parser.add_argument("-K", type=int, required=False)
+        parser.add_argument("--K_range", type=int, nargs=2)
+        parser.add_argument("--K_step", type=int, default=512)
+        args = parser.parse_args()

-if args.K:
-    args.K_range = [args.K, args.K]
-    args.K_step = 1  # doesn't matter as long as it's not 0
+        if args.K:
+            args.K_range = [args.K, args.K]
+            args.K_step = 1  # doesn't matter as long as it's not 0

-torch.manual_seed(0)
+        torch.manual_seed(0)

-validate(32, 32, 32)
-validate(8192, 8192, 512)
+        validate(32, 32, 32)
+        validate(8192, 8192, 512)

-proton.start("matmul", hook="triton")
-for K in range(args.K_range[0], args.K_range[1] + 1, args.K_step):
-    bench(K)
-proton.finalize()
+        proton.start("matmul", hook="triton")
+        for K in range(args.K_range[0], args.K_range[1] + 1, args.K_step):
+            bench(K)
+        proton.finalize()
+    else:
+        print("This tutorial fp8_matmul is only supported on CUDA with cc >= 90")
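
The substantive change to the tutorial is that all executable top-level code now sits behind an `if __name__ == "__main__":` guard combined with a compute-capability check, so the documentation build can import the module on hosts without a Hopper-class GPU instead of exiting at import time. A minimal, self-contained sketch of that guard pattern follows; `run_benchmarks` is a hypothetical stand-in for the tutorial's `validate`/`bench` calls, not a function from the patch:

import torch


def run_benchmarks():
    # Hypothetical placeholder for the tutorial's validate()/bench() loop.
    print("running FP8 matmul benchmarks")


if __name__ == "__main__":
    # FP8 matmul and TMA require compute capability >= 9.0 (Hopper, sm_90).
    # On older GPUs or CPU-only hosts we print and return rather than calling
    # sys.exit(0), so importing the module (e.g. during a docs build) stays
    # side-effect free.
    if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 9:
        run_benchmarks()
    else:
        print("This tutorial fp8_matmul is only supported on CUDA with cc >= 90")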
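
Given the argparse flags retained in the patch, the script accepts either a fixed K or a K sweep; invocations along these lines (working directory and script path assumed) would exercise both code paths:

    python python/tutorials/09-persistent-fp8-matmul.py -K 4096
    python python/tutorials/09-persistent-fp8-matmul.py --K_range 512 8192 --K_step 512

When `-K` is given, the script collapses `--K_range` to `[K, K]` and forces a nonzero `K_step`, so the benchmark loop runs exactly once.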