- I'm Roberto Gambuzzi
- Principal Software Engineer
- Writing Python since 2007
- Always learning new languages, but always coming back to Python.
- If you want to connect https://linktr.ee/gambuzzi
- advent of code solutions
- python
- numba
- cuda
If you never did Advent of Code, Advent of code is a code challenge that is published every year in the Christmas period.
There are 24 challenges every year, you can find them on https://adventofcode.com/
https://adventofcode.com/2021/day/20
Longstory Short: you have an initial 1-bit image and you have rules on how to update every pixel of this image. You get a new image. Repeat.
In PART 1 you are asked of doing it 2 times.
In PART 2 you are asked of doing it 50 times.
You are given an image enhancement algorithm (a string representing 512 bits) and an image.
For each pixel
- you have to take all the touching pixel and the pixel itself (3x3 = 9 pixels),
- create a 9 bit number based on these pixels
- do a lookup to the appropriate bit in the image enhancement algorithm string
- write the value you get in the pixel position in the new output image
Advent of code always provides a simple input that serves as example. Then it provides you the real input to use to get the solution, that often contains some edge cases that the example-input lacks intentionally.
The simple input
..#.#..#####.#.#.#.###.##.....###.##.#..###.####..#####..#....#..#..##..###..######.###...####..#..#####..##..#.#####...##.#.#..#.##..#.#......#.###.######.###.####...#.##.##..#..#..#####.....#.#....###..#.##......#.....#..#..#..##..#...##.######.####.####.#.#...#.......#..#.#.#...####.##.#......#..#...##.#.##..#...##.#.##..###.#......#.#.......#.#.#.####.###.##...#.....####.#..#..#.##.#....##..#.####....##...##..#...#......#.#.......#.......##..####..#...#.#.#...##..#.#..###..#####........#..####......#..#
Image
#..#.
#....
##..#
..#..
..###
def parse_input(img):
# this could possibly be a set, instead of a dict.
grid = {}
for y, row in enumerate(img.splitlines()):
for x, cell in enumerate(row):
grid[x, y] = 1 if c=='#' else 0
return grid
def step(enhancement: AnyStr, grid: Dict[Iterable[int] , AnyStr], default: int=0):
min_x: int = min([x[0] for x in grid if grid[x]])
max_x: int = max([x[0] for x in grid if grid[x]])
min_y: int = min([x[1] for x in grid if grid[x]])
max_y: int = max([x[1] for x in grid if grid[x]])
new_grid = {}
for x in range(min_x-2,max_x+3):
for y in range(min_y-2,max_y+3):
n = 0 # the lookup number
for dx,dy in (
(-1,-1),(0,-1),(1,-1),
(-1, 0),(0, 0),(1, 0),
(-1, 1),(0, 1),(1, 1),):
n *= 2
n += grid.get((x+dx,y+dy),default)
new_grid[x,y] = 1 if enhancement[n] == '#' else 0
return new_grid
def part2(img, enhancement):
grid = parse_input(img)
for i in range(50):
grid = step(
enhancement,
grid,
default=i%2 if enhancement[0]=='#' else 0
)
return sum(grid.values())
print("part2", part2(img, enhancement))
print(timeit.timeit(
"part2(img, enhancement)",
"from __main__ import part2, img, enhancement",
number=1
))
part2 19340
6.5132104279999794
Spoiler alert: Yes.
Nope (well, you have to add decorators)
from numba import njit
njit
is an alias to @jit(nopython=True)
@njit
def parse_input(img):
...
@njit
def step(enhancement: AnyStr, grid: Dict[Iterable[int] , AnyStr], default: int=0):
...
@njit
def part2(img, enhancement):
...
print("part2", part2(img, enhancement))
print(timeit.timeit(
"part2(img, enhancement)",
"from __main__ import part2, img, enhancement",
number=1
))
part2 19340
0.6961225479999484
6.51s -> 0.70s -> 9.3 times faster
With cuda
we can use the GPU of our machine (or a colab instance) to work on this problem, that can be parallelized quite well.
Yes.
Yes, quite a bit.
- I will not use a dictionary anymore
- I will use an array
- I will use a fixed size array, big enough to contain my final image
- On the python side I will use a numpy array
- I will not allocate a new array (previously a dictionary) at every loop
- To be fair with numba, I will rewrite the numba solution with the same new logic
from numba import njit, cuda
import numpy as np
@njit
def parse_input(img):
ret = np.full((200, 200), False)
for y, row in enumerate(img.splitlines()):
for x, cell in enumerate(row):
ret[50 + x, 50 + y] = cell=='#'
return ret
ret
is 200 by 200 array of booleans,- 50 is the offset to avoid negative numbers, because the image also expands in negatie directions, one pixel per step
@cuda.jit
def GPUstep(ingrid, outgrid, enhancement, default):
x, y = cuda.grid(2)
if x<ingrid.shape[0] and y<ingrid.shape[1]:
n = 0
for dx,dy in (
(-1,-1),(0,-1),(1,-1),
(-1, 0),(0, 0),(1, 0),
(-1, 1),(0, 1),(1, 1),):
n *= 2
if (not 0<=x+dx<ingrid.shape[0]) or (not 0<=y+dy<ingrid.shape[1]):
n += default
else:
n += 1 if ingrid[x+dx,y+dy] else 0
outgrid[x][y] = enhancement[n]
def part2GPU(img, enhancement):
np_grid = parse_input(img)
d_grid = cuda.to_device(np_grid)
new_grid = cuda.to_device(
np.full((np_grid.shape[0], np_grid.shape[1]), False)
)
d_enha = cuda.to_device(np.fromiter((x=='#' for x in enhancement), bool))
threads_per_block = (16, 16)
blocks_per_grid_x = (np_grid.shape[0] + threads_per_block[0]
- 1) // threads_per_block[0]
blocks_per_grid_y = (np_grid.shape[1] + threads_per_block[1]
- 1) // threads_per_block[1]
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
for i in range(50):
GPUstep[blocks_per_grid, threads_per_block] \
(d_grid, new_grid, d_enha, i%2 if ench[0]=='#' else 0)
d_grid, new_grid = new_grid, d_grid
return np.count_nonzero(d_grid)
print("GPU part2", part2GPU(img, enhancement))
print(timeit.timeit(
"part2GPU(img, enhancement)",
"from __main__ import part2GPU, img, enhancement",
number=1
))
GPU part2 19340
0.008758712000002333
6.51s -> 0.0088s -> 740 times faster
@njit
def step(enhancement, grid, default=0):
new_grid = np.full((200, 200), False)
for x in range(grid.shape[0]):
for y in range(grid.shape[1]):
n = 0
for dx,dy in (
(-1,-1),(0,-1),(1,-1),
(-1, 0),(0, 0),(1, 0),
(-1, 1),(0, 1),(1, 1),):
n *= 2
if (not 0<=x+dx<grid.shape[0]) or (not 0<=y+dy<grid.shape[1]):
n += default
else:
n += 1 if grid[x+dx,y+dy] else 0
new_grid[x,y] = enhancement[n] == '#'
return new_grid
@njit
def part2(img, enhancement):
grid = parse_input(img)
for i in range(50):
grid = step(enhancement, grid, default=i%2 if enhancement[0]=='#' else 0)
return np.count_nonzero(grid)
print("jit part2", part2(img, enhancement))
print(timeit.timeit(
"part2(img, enhancement)",
"from __main__ import part2, img, enhancement",
number=1
))
jit part2 19340
0.13827341399999682
6.51s -> 0.14s -> 46 times faster
0.70s -> 0.14s -> 5 times faster
but not handling any-size-images