Error: ti.static can only be used inside Taichi kernels

I modified the file diffmpm_simple, putting p2g, grid_op, and g2p together in one kernel substep:

import taichi as ti
import numpy as np
import cv2
import os
import h5py
import matplotlib.pyplot as plt
import torch

real = ti.f32
ti.set_default_fp(real)
ti.get_runtime().set_verbose(True)

bound = 3
dim = 2
n_particles = 256
N = 80
n_grid = 120
dx = 1 / n_grid
inv_dx = 1 / dx
dt = 3e-4
p_mass = 1
p_vol = 1
E = 100
mu = E
la = E
max_steps = 1024
steps = 1024
gravity = 9.8

scalar = lambda: ti.var(dt=real)
vec = lambda: ti.Vector(dim, dt=real)
mat = lambda: ti.Matrix(dim, dim, dt=real)

x = ti.Vector(dim, dt=real, shape=(n_particles), needs_grad=True)
v = ti.Vector(dim, dt=real, shape=(n_particles), needs_grad=True)
grid_v_in = ti.Vector(dim, dt=real, shape=(n_grid, n_grid), needs_grad=True)
grid_v_out = ti.Vector(dim, dt=real, shape=(n_grid, n_grid), needs_grad=True)
grid_m_in = ti.var(dt=real, shape=(n_grid, n_grid), needs_grad=True)
C = ti.Matrix(dim, dim, dt=real, shape=(n_particles), needs_grad=True)
F = ti.Matrix(dim, dim, dt=real, shape=(n_particles), needs_grad=True)
init_v = ti.Vector(dim, dt=real, shape=(), needs_grad=True)

ti.cfg.arch = ti.cuda

@ti.kernel
def random_init():
    init_v[None] = [ti.random(ti.float32)*2-1, ti.random(ti.float32)*2-1]
    for i in range(n_particles):
        v[i] = init_v

@ti.kernel
def substep():
    grid_v_in.fill(0)
    grid_m_in.fill(0)
    for p in range(0, n_particles):
        base = ti.cast(x[p] * inv_dx - 0.5, ti.i32)
        fx = x[p] * inv_dx - ti.cast(base, ti.i32)
        w = [0.5 * ti.sqr(1.5 - fx), 0.75 - ti.sqr(fx - 1),
             0.5 * ti.sqr(fx - 0.5)]
        new_F = (ti.Matrix.diag(dim=2, val=1) + dt * C[p]) @ F[p]
        F[p] = new_F
        J = ti.determinant(new_F)
        r, s = ti.polar_decompose(new_F)
        cauchy = 2 * mu * (new_F - r) @ ti.transposed(new_F) + \
                 ti.Matrix.diag(2, la * (J - 1) * J)
        stress = -(dt * p_vol * 4 * inv_dx * inv_dx) * cauchy
        affine = stress + p_mass * C[p]
        for i in ti.static(range(3)):
            for j in ti.static(range(3)):
                offset = ti.Vector([i, j])
                dpos = (ti.cast(ti.Vector([i, j]), real) - fx) * dx
                weight = w[i](0) * w[j](1)
                grid_v_in[base + offset] += weight * (p_mass * v[p] + affine @ dpos)
                grid_m_in[base + offset] += weight * p_mass
    for p in range(n_grid * n_grid):
        i = p // n_grid
        j = p - n_grid * i
        inv_m = 1 / (grid_m_in[i, j] + 1e-10)
        v_out = inv_m * grid_v_in[i, j]
        v_out[1] -= dt * gravity
        if i < bound and v_out[0] < 0:
            v_out[0] = 0
        if i > n_grid - bound and v_out[0] > 0:
            v_out[0] = 0
        if j < bound and v_out[1] < 0:
            v_out[1] = 0
        if j > n_grid - bound and v_out[1] > 0:
            v_out[1] = 0
        grid_v_out[i, j] = v_out
    for p in range(n_particles):
        base = ti.cast(x[p] * inv_dx - 0.5, ti.i32)
        fx = x[p] * inv_dx - ti.cast(base, real)
        w = [0.5 * ti.sqr(1.5 - fx), 0.75 - ti.sqr(fx - 1.0),
             0.5 * ti.sqr(fx - 0.5)]
        new_v = ti.Vector([0.0, 0.0])
        new_C = ti.Matrix([[0.0, 0.0], [0.0, 0.0]])

        for i in ti.static(range(3)):
            for j in ti.static(range(3)):
                dpos = ti.cast(ti.Vector([i, j]), real) - fx
                g_v = grid_v_out[base(0) + i, base(1) + j]
                weight = w[i](0) * w[j](1)
                new_v += weight * g_v
                new_C += 4 * weight * ti.outer_product(g_v, dpos) * inv_dx

        v[p] = new_v
        x[p] = x[p] + dt * v[p]
        C[p] = new_C


for k in range(30):
    random_init()
    for i in range(N):
        for j in range(N):
            x[i * N + j] = [dx * (i * 0.5 + 10), dx * (j * 0.5 + 25)]
    C.fill(0)
    for i in range(n_particles):
        F[i] = [[1, 0], [0, 1]]
    x_list = [x.to_torch().squeeze(-1)]

    for s in range(steps - 1):
        substep()
        x_list.append(x.to_torch().squeeze(-1).float())  # to tensor is copy so no problem

    # visualize
    for s in range(63, steps, 64):
        scale = 4
        img = np.zeros(shape=(scale * n_grid, scale * n_grid)) + 0.3
        total = [0, 0]
        for i in range(n_particles):
            p_x = int(scale * x_list[s][i, 0] / dx)
            p_y = int(scale * x_list[s][i, 1] / dx)
            img[p_x, p_y] = 1
        img = img.swapaxes(0, 1)[::-1]
        cv2.imshow('MPM', img)
        cv2.waitKey(1)

I got the error saying ti.static can only be used inside Taichi kernels while ti.static is indeed in the kernel:

[Release mode]
[Taichi version 0.3.2, cpu only, commit 26635f82]
Initializing runtime with 48 elements
Runtime initialized.
Traceback (most recent call last):
  File "D:/learn_mpm/test.py", line 122, in <module>
    substep()
  File "C:\Users\49446\AppData\Local\conda\conda\envs\py36\lib\site-packages\taichi\lang\kernel.py", line 308, in __call__
    self.materialize(key=key, args=args, arg_features=self.mapper.extract(args))
  File "C:\Users\49446\AppData\Local\conda\conda\envs\py36\lib\site-packages\taichi\lang\kernel.py", line 233, in materialize
    taichi_kernel = taichi_kernel.define(taichi_ast_generator)
  File "C:\Users\49446\AppData\Local\conda\conda\envs\py36\lib\site-packages\taichi\lang\kernel.py", line 230, in taichi_ast_generator
    compiled()
  File "D:/learn_mpm/test.py", line 68, in substep
    for i in ti.static(range(3)):
  File "C:\Users\49446\AppData\Local\conda\conda\envs\py36\lib\site-packages\taichi\lang\impl.py", line 252, in static
    ).inside_kernel, 'ti.static can only be used inside Taichi kernels'
AssertionError: ti.static can only be used inside Taichi kernels

Process finished with exit code 1

Is there any limit while putting such a large amount of codes in one kernel?

I would move

grid_v_in.fill(0)
grid_m_in.fill(0)

outside substep, as they actually indirectly invoke other kernels.

Sorry about the confusing error message. Nested kernels (those induced by fill inside substep) are not allowed: https://taichi.readthedocs.io/en/latest/hello.html#functions-and-kernels

Here’s the working script:

import taichi as ti
import numpy as np
import cv2
import os
import h5py
import matplotlib.pyplot as plt
import torch

real = ti.f32
ti.set_default_fp(real)
ti.get_runtime().set_verbose(True)

bound = 3
dim = 2
n_particles = 256
N = 80
n_grid = 120
dx = 1 / n_grid
inv_dx = 1 / dx
dt = 3e-4
p_mass = 1
p_vol = 1
E = 100
mu = E
la = E
max_steps = 1024
steps = 1024
gravity = 9.8

scalar = lambda: ti.var(dt=real)
vec = lambda: ti.Vector(dim, dt=real)
mat = lambda: ti.Matrix(dim, dim, dt=real)

x = ti.Vector(dim, dt=real, shape=(n_particles), needs_grad=True)
v = ti.Vector(dim, dt=real, shape=(n_particles), needs_grad=True)
grid_v_in = ti.Vector(dim, dt=real, shape=(n_grid, n_grid), needs_grad=True)
grid_v_out = ti.Vector(dim, dt=real, shape=(n_grid, n_grid), needs_grad=True)
grid_m_in = ti.var(dt=real, shape=(n_grid, n_grid), needs_grad=True)
C = ti.Matrix(dim, dim, dt=real, shape=(n_particles), needs_grad=True)
F = ti.Matrix(dim, dim, dt=real, shape=(n_particles), needs_grad=True)
init_v = ti.Vector(dim, dt=real, shape=(), needs_grad=True)

# ti.cfg.arch = ti.cuda

@ti.kernel
def random_init():
    init_v[None] = [ti.random(ti.float32)*2-1, ti.random(ti.float32)*2-1]
    for i in range(n_particles):
        v[i] = init_v

@ti.kernel
def substep():
    for p in range(0, n_particles):
        base = ti.cast(x[p] * inv_dx - 0.5, ti.i32)
        fx = x[p] * inv_dx - ti.cast(base, ti.i32)
        w = [0.5 * ti.sqr(1.5 - fx), 0.75 - ti.sqr(fx - 1),
             0.5 * ti.sqr(fx - 0.5)]
        new_F = (ti.Matrix.diag(dim=2, val=1) + dt * C[p]) @ F[p]
        F[p] = new_F
        J = ti.determinant(new_F)
        r, s = ti.polar_decompose(new_F)
        cauchy = 2 * mu * (new_F - r) @ ti.transposed(new_F) + \
                 ti.Matrix.diag(2, la * (J - 1) * J)
        stress = -(dt * p_vol * 4 * inv_dx * inv_dx) * cauchy
        affine = stress + p_mass * C[p]
        for i in ti.static(range(3)):
            for j in ti.static(range(3)):
                offset = ti.Vector([i, j])
                dpos = (ti.cast(ti.Vector([i, j]), real) - fx) * dx
                weight = w[i](0) * w[j](1)
                grid_v_in[base + offset] += weight * (p_mass * v[p] + affine @ dpos)
                grid_m_in[base + offset] += weight * p_mass
    for p in range(n_grid * n_grid):
        i = p // n_grid
        j = p - n_grid * i
        inv_m = 1 / (grid_m_in[i, j] + 1e-10)
        v_out = inv_m * grid_v_in[i, j]
        v_out[1] -= dt * gravity
        if i < bound and v_out[0] < 0:
            v_out[0] = 0
        if i > n_grid - bound and v_out[0] > 0:
            v_out[0] = 0
        if j < bound and v_out[1] < 0:
            v_out[1] = 0
        if j > n_grid - bound and v_out[1] > 0:
            v_out[1] = 0
        grid_v_out[i, j] = v_out
    for p in range(n_particles):
        base = ti.cast(x[p] * inv_dx - 0.5, ti.i32)
        fx = x[p] * inv_dx - ti.cast(base, real)
        w = [0.5 * ti.sqr(1.5 - fx), 0.75 - ti.sqr(fx - 1.0),
             0.5 * ti.sqr(fx - 0.5)]
        new_v = ti.Vector([0.0, 0.0])
        new_C = ti.Matrix([[0.0, 0.0], [0.0, 0.0]])

        for i in ti.static(range(3)):
            for j in ti.static(range(3)):
                dpos = ti.cast(ti.Vector([i, j]), real) - fx
                g_v = grid_v_out[base(0) + i, base(1) + j]
                weight = w[i](0) * w[j](1)
                new_v += weight * g_v
                new_C += 4 * weight * ti.outer_product(g_v, dpos) * inv_dx

        v[p] = new_v
        x[p] = x[p] + dt * v[p]
        C[p] = new_C


for k in range(30):
    random_init()
    for i in range(N):
        for j in range(N):
            x[i * N + j] = [dx * (i * 0.5 + 10), dx * (j * 0.5 + 25)]
    C.fill(0)
    for i in range(n_particles):
        F[i] = [[1, 0], [0, 1]]
    x_list = [x.to_torch().squeeze(-1)]

    for s in range(steps - 1):
        grid_v_in.fill(0)
        grid_m_in.fill(0)
        substep()
        x_list.append(x.to_torch().squeeze(-1).float())  # to tensor is copy so no problem

    # visualize
    for s in range(63, steps, 64):
        scale = 4
        img = np.zeros(shape=(scale * n_grid, scale * n_grid)) + 0.3
        total = [0, 0]
        for i in range(n_particles):
            p_x = int(scale * x_list[s][i, 0] / dx)
            p_y = int(scale * x_list[s][i, 1] / dx)
            img[p_x, p_y] = 1
        img = img.swapaxes(0, 1)[::-1]
        cv2.imshow('MPM', img)
        cv2.waitKey(1)

Thank you! I did not notice that the fill function is actually a ti.kernel.

It indirectly calls a kernel which is metaprogrammed: https://github.com/yuanming-hu/taichi/blob/master/python/taichi/lang/meta.py#L6