Skip to content

Commit

Permalink
[async] Add advection benchmark (taichi-dev#1914)
Browse files Browse the repository at this point in the history
  • Loading branch information
yuanming-hu authored Oct 1, 2020
1 parent 3aeff7c commit 75d5ecc
Show file tree
Hide file tree
Showing 10 changed files with 184 additions and 44 deletions.
125 changes: 125 additions & 0 deletions benchmarks/async_advection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import taichi as ti
import math

from utils import benchmark_async

# TODO: staggered grid


@benchmark_async
def advection_2d(scale):
    """Benchmark semi-Lagrangian advection of a 3-channel field on a 2D grid.

    The grid is ``n x n`` with ``n = 128 * 2**(log2(scale) // 2)``, i.e. the
    resolution doubles every time ``scale`` grows by a factor of four.
    The field ``x`` is advected through a fixed velocity field ``v`` and the
    ``advect`` kernel is timed with ``ti.benchmark``.

    Args:
        scale: Problem-size factor supplied by the ``benchmark_async``
            decorator; expected to be a positive number.
    """
    # math.log2 is exact for powers of two, whereas math.log(scale, 2) is a
    # quotient of two rounded logarithms and can land just below an integer,
    # which the floor division below would then round down a whole step.
    n = 128 * 2**int(math.log2(scale) // 2)

    # x: advected 3-channel quantity (shown as an image when `visualize` is
    # enabled); new_x: scratch buffer for one step; v: 2D velocity per cell.
    x = ti.Vector.field(3, dtype=ti.f32, shape=(n, n))
    new_x = ti.Vector.field(3, dtype=ti.f32, shape=(n, n))
    v = ti.Vector.field(2, dtype=ti.f32, shape=(n, n))
    dx = 1 / n
    inv_dx = 1 / dx
    dt = 0.01

    # Samples live at cell centres: position = (index + stagger) * dx.
    stagger = ti.Vector([0.5, 0.5])

    # Fill v with the field (j/n - 0.5, 0.5 - i/n), a rotation about the
    # domain centre, and x with the Taichi logo. The logo is sampled on a
    # 4x-finer grid; each of the 16 sub-samples adds ret / 16 to its cell,
    # so every cell ends up with the average of its sub-samples.
    @ti.kernel
    def init():
        for i, j in v:
            v[i, j] = ti.Vector([j / n - 0.5, 0.5 - i / n])

        for i, j in ti.ndrange(n * 4, n * 4):
            ret = ti.taichi_logo(ti.Vector([i, j]) / (n * 4))
            x[i // 4, j // 4][0] += ret / 16
            x[i // 4, j // 4][1] += ret / 16
            x[i // 4, j // 4][2] += ret / 16

    # Shorthand for building a 2D vector inside Taichi scope.
    @ti.func
    def vec(x, y):
        return ti.Vector([x, y])

    # Clamp every coordinate of p to
    # [stagger * dx, 1 - 1e-4 - dx + stagger * dx] so that the 2x2 stencil
    # anchored at floor(p * inv_dx - stagger) stays inside the grid.
    @ti.func
    def clamp(p):
        for d in ti.static(range(p.n)):
            p[d] = min(1 - 1e-4 - dx + stagger[d] * dx,
                       max(p[d], stagger[d] * dx))
        return p

    # Bilinearly interpolate field x at position p (clamped first).
    @ti.func
    def sample_bilinear(x, p):
        p = clamp(p)

        p_grid = p * inv_dx - stagger

        I = ti.cast(ti.floor(p_grid), ti.i32)
        # f / g are the interpolation weights towards the upper / lower
        # neighbour along each axis.
        f = p_grid - I
        g = 1 - f

        return x[I] * (g[0] * g[1]) + x[I + vec(1, 0)] * (f[0] * g[1]) + x[
            I + vec(0, 1)] * (g[0] * f[1]) + x[I + vec(1, 1)] * (f[0] * f[1])

    # Velocity at position p, interpolated from v.
    @ti.func
    def velocity(p):
        return sample_bilinear(v, p)

    # Minimum of x over the 2x2 stencil surrounding p.
    # NOTE(review): not called by the current `advect` kernel.
    @ti.func
    def sample_min(x, p):
        p = clamp(p)
        p_grid = p * inv_dx - stagger
        I = ti.cast(ti.floor(p_grid), ti.i32)

        return min(x[I], x[I + vec(1, 0)], x[I + vec(0, 1)], x[I + vec(1, 1)])

    # Maximum of x over the 2x2 stencil surrounding p.
    # NOTE(review): not called by the current `advect` kernel.
    @ti.func
    def sample_max(x, p):
        p = clamp(p)
        p_grid = p * inv_dx - stagger
        I = ti.cast(ti.floor(p_grid), ti.i32)

        return max(x[I], x[I + vec(1, 0)], x[I + vec(0, 1)], x[I + vec(1, 1)])

    # Trace the sample point of cell I backwards through v for time dt.
    @ti.func
    def backtrace(I, dt):  # RK3
        p = (I + stagger) * dx
        v1 = velocity(p)
        p1 = p - 0.5 * dt * v1
        v2 = velocity(p1)
        p2 = p - 0.75 * dt * v2
        v3 = velocity(p2)
        p -= dt * (2 / 9 * v1 + 1 / 3 * v2 + 4 / 9 * v3)
        return p

    # One semi-Lagrangian step: every cell of new_x takes the value of x
    # interpolated at that cell's back-traced position.
    @ti.func
    def semi_lagrangian(x, new_x, dt):
        for I in ti.grouped(x):
            new_x[I] = sample_bilinear(x, backtrace(I, dt))

    # Advance x by one time step, one component at a time, then copy the
    # result back from new_x.
    # NOTE(review): x(c) presumably selects the c-th scalar component field
    # of the vector field - confirm against the Taichi version in use.
    @ti.kernel
    def advect():
        semi_lagrangian(x(0), new_x(0), dt)
        semi_lagrangian(x(1), new_x(1), dt)
        semi_lagrangian(x(2), new_x(2), dt)

        for I in ti.grouped(x):
            x[I] = new_x[I]

    init()

    def task():
        # One benchmark sample consists of 10 advection steps.
        for _ in range(10):
            advect()

    ti.benchmark(task, repeat=100)

    # Flip to True locally to watch the advected field in a GUI window.
    visualize = False

    if visualize:
        gui = ti.GUI('Advection schemes', (n, n))
        for i in range(10):
            for _ in range(10):
                advect()
            gui.set_image(x.to_numpy())
            gui.show()


# Run the benchmark when this file is executed directly as a script.
if __name__ == '__main__':
    advection_2d()
30 changes: 6 additions & 24 deletions benchmarks/async_cases.py
Original file line number Diff line number Diff line change
@@ -1,52 +1,34 @@
import taichi as ti
import os
import sys
import functools

sys.path.append(os.path.join(ti.core.get_repo_dir(), 'tests', 'python'))

from fuse_test_template import template_fuse_dense_x2y2z, \
template_fuse_reduction


# Note: this is a short-term solution. In the long run we need to think about how to reuse pytest
def benchmark_async(func):
@functools.wraps(func)
def body():
for arch in [ti.cpu, ti.cuda]:
for async_mode in [True, False]:
os.environ['TI_CURRENT_BENCHMARK'] = func.__name__
ti.init(arch=arch, async_mode=async_mode)
if arch == ti.cpu:
scale = 2
else:
# Use more data to hide compilation overhead
# (since CUDA runs much faster than CPUs)
scale = 64
func(scale)

return body
from utils import *


@benchmark_async
def fuse_dense_x2y2z(scale):
template_fuse_dense_x2y2z(size=scale * 10 * 1024**2,
template_fuse_dense_x2y2z(size=scale * 1024**2,
repeat=1,
benchmark_repeat=100,
benchmark=True)


@benchmark_async
def fuse_reduction(scale):
template_fuse_reduction(size=scale * 10 * 1024**2,
template_fuse_reduction(size=scale * 1024**2,
repeat=10,
benchmark_repeat=10,
benchmark=True)


@benchmark_async
def fill_1d(scale):
a = ti.field(dtype=ti.f32, shape=scale * 10 * 1024**2)
a = ti.field(dtype=ti.f32, shape=scale * 1024**2)

@ti.kernel
def fill():
Expand Down Expand Up @@ -81,7 +63,7 @@ def sparse_numpy(scale):
a = ti.field(dtype=ti.f32)
b = ti.field(dtype=ti.f32)

block_count = 2**int((math.log(scale, 2)) // 2) * 64
block_count = 2**int((math.log(scale, 2)) // 2) * 4
block_size = 32
# a, b always share the same sparsity
ti.root.pointer(ti.ij, block_count).dense(ti.ij, block_size).place(a, b)
Expand Down Expand Up @@ -145,7 +127,7 @@ def stencil_reduction(scale):
b = ti.field(dtype=ti.f32)
total = ti.field(dtype=ti.f32, shape=())

block_count = scale * 512
block_count = scale * 64
block_size = 1024
# a, b always share the same sparsity
ti.root.pointer(ti.i, block_count).dense(ti.i, block_size).place(a, b)
Expand Down
10 changes: 3 additions & 7 deletions benchmarks/benchmark_async.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
import taichi as ti

from async_cases import *
from async_advection import *

rerun = True

cases = [
fuse_dense_x2y2z,
fuse_reduction,
fill_1d,
sparse_numpy,
autodiff,
stencil_reduction,
# mpm_splitted,
fuse_dense_x2y2z, fuse_reduction, fill_1d, sparse_numpy, autodiff,
stencil_reduction, mpm_splitted, advection_2d
]

if rerun:
Expand Down
21 changes: 21 additions & 0 deletions benchmarks/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import taichi as ti
import functools
import os


def benchmark_async(func):
    """Wrap a benchmark case so it runs on every arch / async-mode pair.

    The returned wrapper takes no arguments. For each of ``ti.cpu`` and
    ``ti.cuda``, with ``async_mode`` first on and then off, it stores the
    case name in the ``TI_CURRENT_BENCHMARK`` environment variable,
    re-initializes Taichi with the kernel profiler enabled, and calls
    ``func(scale)``.

    Args:
        func: Benchmark body accepting a single ``scale`` argument.

    Returns:
        A zero-argument callable carrying ``func``'s metadata.
    """
    @functools.wraps(func)
    def body():
        for target_arch in (ti.cpu, ti.cuda):
            # CUDA runs much faster than CPUs, so it gets more data to hide
            # the compilation overhead.
            problem_scale = 2 if target_arch == ti.cpu else 64
            for use_async in (True, False):
                os.environ['TI_CURRENT_BENCHMARK'] = func.__name__
                ti.init(arch=target_arch,
                        async_mode=use_async,
                        kernel_profiler=True)
                func(problem_scale)

    return body
3 changes: 2 additions & 1 deletion examples/mpm_lagrangian_forces.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import taichi as ti
import numpy as np

ti.init(arch=ti.gpu)
ti.init(arch=ti.gpu, kernel_profiler=True)

dim = 2
quality = 8 # Use a larger integral number for higher quality
Expand Down Expand Up @@ -187,6 +187,7 @@ def main():
color=0xFFFFFF,
radius=3)
gui.show()
ti.kernel_profiler_print()


if __name__ == '__main__':
Expand Down
16 changes: 10 additions & 6 deletions python/taichi/lang/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,10 @@
cc = core.cc
gpu = [cuda, metal, opengl]
cpu = core.host_arch()
kernel_profiler_print = lambda: core.get_current_program(
).kernel_profiler_print()
kernel_profiler_clear = lambda: core.get_current_program(
).kernel_profiler_clear()
kernel_profiler_print = lambda: get_runtime().prog.kernel_profiler_print()
kernel_profiler_clear = lambda: get_runtime().prog.kernel_profiler_clear()
kernel_profiler_total_time = lambda: get_runtime(
).prog.kernel_profiler_total_time()


def memory_profiler_print():
Expand Down Expand Up @@ -331,19 +331,23 @@ def run_benchmark():
ti.stat_write('offloaded_tasks', b)
elif a == 'launched_tasks':
ti.stat_write('launched_tasks', b)
# The reason why we run 3 more times is to warm up

# Use 3 initial iterations to warm up
# instruction/data caches. Discussion:
# https://github.com/taichi-dev/taichi/pull/1002#discussion_r426312136
for i in range(3):
func(*args)
ti.sync()
ti.kernel_profiler_clear()
t = time.time()
for n in range(repeat):
func(*args)
ti.sync()
elapsed = time.time() - t
avg = elapsed / repeat
ti.stat_write('running_time', avg)
ti.stat_write('clock_time', avg)
device_time = ti.kernel_profiler_total_time()
ti.stat_write('device_time', device_time)

run_benchmark()

Expand Down
13 changes: 9 additions & 4 deletions taichi/program/kernel_profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ void KernelProfilerBase::print() {
"name\n");
std::sort(records.begin(), records.end());
for (auto &rec : records) {
auto fraction = rec.total / total_time * 100.0f;
auto fraction = rec.total / total_time_ms * 100.0f;
fmt::print("[{:6.2f}% {:7.3f} s {:6d}x |{:9.3f} {:9.3f} {:9.3f} ms] {}\n",
fraction, rec.total / 1000.0f, rec.counter, rec.min,
rec.total / rec.counter, rec.max, rec.name);
Expand All @@ -53,12 +53,17 @@ void KernelProfilerBase::print() {
fmt::print(
"[100.00%] Total kernel execution time: {:7.3f} s number of records: "
"{}\n",
total_time / 1000.0f, records.size());
get_total_time(), records.size());

fmt::print(
"========================================================================"
"=\n");
}

// Returns the accumulated kernel time converted from milliseconds to seconds.
double KernelProfilerBase::get_total_time() const {
  constexpr double ms_per_second = 1000.0;
  return total_time_ms / ms_per_second;
}

namespace {
// A simple profiler that uses Time::get_time()
class DefaultProfiler : public KernelProfilerBase {
Expand Down Expand Up @@ -90,7 +95,7 @@ class DefaultProfiler : public KernelProfilerBase {
it = std::prev(records.end());
}
it->insert_sample(ms);
total_time += ms;
total_time_ms += ms;
}

private:
Expand Down Expand Up @@ -150,7 +155,7 @@ class KernelProfilerCUDA : public KernelProfilerBase {
it = std::prev(records.end());
}
it->insert_sample(ms);
total_time += ms;
total_time_ms += ms;
}
}
outstanding_events.clear();
Expand Down
6 changes: 4 additions & 2 deletions taichi/program/kernel_profiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,14 @@ struct KernelProfileRecord {
class KernelProfilerBase {
protected:
std::vector<KernelProfileRecord> records;
double total_time;
double total_time_ms;

public:
// Needed for the CUDA backend since we need to know which task to "stop"
using TaskHandle = void *;

void clear() {
total_time = 0;
total_time_ms = 0;
records.clear();
}

Expand All @@ -62,6 +62,8 @@ class KernelProfilerBase {

void print();

double get_total_time() const;

virtual ~KernelProfilerBase() {
}
};
Expand Down
2 changes: 2 additions & 0 deletions taichi/program/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,8 @@ void Program::synchronize() {
if (config.async_mode) {
async_engine->synchronize();
}
if (profiler)
profiler->sync();
device_synchronize();
sync = true;
}
Expand Down
2 changes: 2 additions & 0 deletions taichi/python/export_lang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,8 @@ void export_lang(py::module &m) {
.def(py::init<>())
.def_readonly("config", &Program::config)
.def("kernel_profiler_print", &Program::kernel_profiler_print)
.def("kernel_profiler_total_time",
[](Program *program) { return program->profiler->get_total_time(); })
.def("kernel_profiler_clear", &Program::kernel_profiler_clear)
.def("print_memory_profiler_info", &Program::print_memory_profiler_info)
.def("finalize", &Program::finalize)
Expand Down

0 comments on commit 75d5ecc

Please sign in to comment.