设置了 arch=ti.gpu，但 GPU 却没有被利用，仍然在使用 CPU，这是什么问题？

zy00 · 2022 年3 月 30 日 13:23

import taichi as ti

# Ask Taichi for a GPU backend. The backend actually selected is reported in
# the startup log line "[Taichi] Starting on arch=...".
ti.init(arch=ti.gpu)

# use_mc: advect() runs maccormack() when True, semi_lagrangian() otherwise.
use_mc = False
# mc_clipping: when True, maccormack() replaces a corrected value that falls
# outside the min/max of the neighbouring source samples with the plain
# bilinear sample.
mc_clipping = False
# pause: toggled by the SPACE key in the GUI loop; advect() is skipped while True.
pause = False

# Runge-Kutta order used by backtrace(); orders 1, 2 and 3 are implemented.
rk = 1

# Grid resolution along each axis.
n = 512
# x is the advected scalar field; new_x receives the result of an advection
# pass; new_x_aux holds the backward pass used by maccormack().
x = ti.field(ti.f32, shape=(n, n))
new_x = ti.field(ti.f32, shape=(n, n))
new_x_aux = ti.field(ti.f32, shape=(n, n))
# Grid spacing and its reciprocal (positions are computed as (I + stagger) * dx).
dx = 1 / n
inv_dx = 1 / dx
# Time step handed to the advection schemes by advect().
dt = 0.05

# Offset of the sample point inside a cell, in cell units.
stagger = ti.Vector([0.5, 0.5])


@ti.func
def Vector2(x, y):
    # Build a two-component Taichi vector from the scalars x and y.
    return ti.Vector([x, y])


@ti.func
def inside(p, c, r):
    # Test whether point p lies within, or on the boundary of, the circle with
    # centre c and radius r. Squared lengths are compared on both sides.
    offset = p - c
    return offset.norm_sqr() <= r * r


@ti.func
def inside_taichi(p):
    # Classify point p as 0 or 1 against a fixed set of circles plus a final
    # left/right split (presumably the Taichi logo, judging by the name).
    # The rules are tried in order and the first one that matches decides the
    # result; ret stays -1 until a rule has matched.
    centre = Vector2(0.5, 0.5)
    # Scale p away from the centre by 1.2 before testing.
    p = centre + (p - centre) * 1.2
    ret = -1
    if ret == -1:
        if not inside(p, Vector2(0.50, 0.50), 0.55):
            ret = 0
    if ret == -1:
        if not inside(p, Vector2(0.50, 0.50), 0.50):
            ret = 1
    if ret == -1:
        if inside(p, Vector2(0.50, 0.25), 0.09):
            ret = 1
    if ret == -1:
        if inside(p, Vector2(0.50, 0.75), 0.09):
            ret = 0
    if ret == -1:
        if inside(p, Vector2(0.50, 0.25), 0.25):
            ret = 0
    if ret == -1:
        if inside(p, Vector2(0.50, 0.75), 0.25):
            ret = 1
    # Fallback when no circle rule matched: split on the first coordinate.
    if ret == -1:
        if p[0] < 0.5:
            ret = 1
        else:
            ret = 0
    return ret


@ti.kernel
def paint():
    # Fill x by sampling inside_taichi() on a grid four times finer than x in
    # each direction. Every cell of x receives 4 x 4 sub-samples, each adding
    # 1/16 of (1 - inside_taichi(...)) to it.
    for i, j in ti.ndrange(n * 4, n * 4):
        sample = Vector2(i / n / 4, j / n / 4)
        coverage = 1 - inside_taichi(sample)
        x[i // 4, j // 4] += coverage / 16


@ti.func
def velocity(p):
    # Velocity at position p: (p[1] - 0.5, 0.5 - p[0]), i.e. a rotation of the
    # domain about the point (0.5, 0.5).
    u = p[1] - 0.5
    v = 0.5 - p[0]
    return ti.Vector([u, v])


@ti.func
def vec(x, y):
    # Build a two-component Taichi vector from x and y. Same body as Vector2();
    # the sampling helpers use it for integer neighbour offsets such as vec(1, 0).
    return ti.Vector([x, y])


@ti.func
def clamp(p):
    # Clamp every component of p to the range
    # [stagger * dx, 1 - 1e-4 - dx + stagger * dx] and return the result.
    # The loop over components is unrolled at compile time via ti.static.
    for d in ti.static(range(p.n)):
        lower = stagger[d] * dx
        upper = 1 - 1e-4 - dx + stagger[d] * dx
        p[d] = min(upper, max(p[d], lower))
    return p


@ti.func
def sample_bilinear(x, p):
    # Bilinearly interpolate field x at position p.
    # p is clamped first, then converted to grid coordinates; the four
    # surrounding grid values are blended with weights built from the
    # fractional part of the grid coordinate.
    p = clamp(p)
    p_grid = p * inv_dx - stagger

    base = ti.cast(ti.floor(p_grid), ti.i32)
    frac = p_grid - base
    rest = 1 - frac

    w00 = rest[0] * rest[1]
    w10 = frac[0] * rest[1]
    w01 = rest[0] * frac[1]
    w11 = frac[0] * frac[1]

    return (x[base] * w00 + x[base + vec(1, 0)] * w10 +
            x[base + vec(0, 1)] * w01 + x[base + vec(1, 1)] * w11)


@ti.func
def sample_min(x, p):
    # Smallest of the four grid values of x that surround position p
    # (p is clamped before the lookup).
    p = clamp(p)
    p_grid = p * inv_dx - stagger
    base = ti.cast(ti.floor(p_grid), ti.i32)

    v00 = x[base]
    v10 = x[base + vec(1, 0)]
    v01 = x[base + vec(0, 1)]
    v11 = x[base + vec(1, 1)]
    return min(v00, v10, v01, v11)


@ti.func
def sample_max(x, p):
    # Largest of the four grid values of x that surround position p
    # (p is clamped before the lookup).
    p = clamp(p)
    p_grid = p * inv_dx - stagger
    base = ti.cast(ti.floor(p_grid), ti.i32)

    v00 = x[base]
    v10 = x[base + vec(1, 0)]
    v01 = x[base + vec(0, 1)]
    v11 = x[base + vec(1, 1)]
    return max(v00, v10, v01, v11)


@ti.func
def backtrace(I, dt):
    # Trace the sample point of cell I backwards through velocity() over a
    # time step dt and return the resulting position.
    # The integrator is picked at compile time from the module-level rk
    # (1, 2 or 3 stages). For any other value a message is printed during
    # compilation and the starting position is returned unchanged.
    p = (I + stagger) * dx
    if ti.static(rk == 1):
        p -= dt * velocity(p)
    elif ti.static(rk == 2):
        midpoint = p - 0.5 * dt * velocity(p)
        p -= dt * velocity(midpoint)
    elif ti.static(rk == 3):
        k1 = velocity(p)
        k2 = velocity(p - 0.5 * dt * k1)
        k3 = velocity(p - 0.75 * dt * k2)
        p -= dt * (2 / 9 * k1 + 1 / 3 * k2 + 4 / 9 * k3)
    else:
        ti.static_print(f"RK{rk} is not supported.")

    return p


@ti.func
def semi_lagrangian(x, new_x, dt):
    # Semi-Lagrangian advection: for every cell of x, trace back over dt and
    # store the bilinear sample of x at that position into new_x.
    # Note: this loop is parallelized
    for I in ti.grouped(x):
        departure = backtrace(I, dt)
        new_x[I] = sample_bilinear(x, departure)


# Reference: https://github.com/ziyinq/Bimocq/blob/master/src/bimocq2D/BimocqSolver2D.cpp

@ti.func
def maccormack(x, dt):
    # MacCormack advection of x over a time step dt; the result is written to
    # the module-level field new_x, with new_x_aux used as scratch space.
    # Pass 1 advects x forward into new_x. Pass 2 advects new_x backward
    # (with -dt) into new_x_aux. Pass 3 adds half of the round-trip error
    # x - new_x_aux to new_x.
    semi_lagrangian(x, new_x, dt)
    semi_lagrangian(new_x, new_x_aux, -dt)

    for I in ti.grouped(x):
        corrected = new_x[I] + 0.5 * (x[I] - new_x_aux[I])
        new_x[I] = corrected

        if ti.static(mc_clipping):
            # Optional limiter: if the corrected value leaves the range of the
            # four source samples, use the plain bilinear sample instead.
            source_pos = backtrace(I, dt)
            lowest = sample_min(x, source_pos)
            highest = sample_max(x, source_pos)

            if corrected < lowest or corrected > highest:
                new_x[I] = sample_bilinear(x, source_pos)

@ti.kernel
def advect():
    # Advance the field x by one time step dt.
    # The scheme is chosen at compile time from use_mc; both schemes write
    # their result into new_x.
    if ti.static(use_mc):
        maccormack(x, dt)
    else:
        semi_lagrangian(x, new_x, dt)

    # Copy the advected result back so x holds the current state for the next
    # call and for display.
    for I in ti.grouped(x):
        x[I] = new_x[I]


# Rasterise the initial image into x once, before the display loop starts.
paint()

gui = ti.GUI('Advection schemes', (512, 512))

# Display loop: handle key presses, advance the simulation one step per frame
# unless paused, then show the current field.
while True:
    while gui.get_event(ti.GUI.PRESS):
        key = gui.event.key
        if key in (ti.GUI.ESCAPE, ti.GUI.EXIT):
            # Raise SystemExit directly rather than calling exit(): that helper
            # is injected by the site module for interactive sessions and is
            # not guaranteed to exist in every interpreter configuration.
            raise SystemExit(0)
        if key == ti.GUI.SPACE:
            pause = not pause
    if not pause:
        advect()
    # The field is converted to a NumPy array for display on every frame.
    gui.set_image(x.to_numpy())
    gui.show()

GPU 利用率一直是 0%，这是什么原因呢？

YuPeng · 2022 年3 月 30 日 15:18

你可以把运行代码之后命令行输出的信息贴出来么？比如我的：

[Taichi] version 0.9.3, llvm 10.0.0, commit eee0e7c1, osx, python 3.8.11
[Taichi] Starting on arch=metal

zy00 · 2022 年3 月 31 日 01:37

zy00 · 2022 年3 月 31 日 01:38

YuPeng · 2022 年3 月 31 日 02:51

从你的截图可以看出来是使用了GPU的，我跑了程序在我的机器上也是用了GPU而且使用率也上升了。你可以运行一些其他比较复杂的Taichi程序，然后看一下GPU的使用。比如：

python3 -m taichi example stable_fluid

zy00 · 2022 年3 月 31 日 03:13

我跑了difftaichi里面的例子，只有3-4帧，GPU利用率依然是0%

zy00 · 2022 年3 月 31 日 03:14

YuPeng · 2022 年3 月 31 日 12:59

很奇怪哎，那你尝试一下其他后端比如OpenGL试试呢？

zy00 · 2022 年3 月 31 日 13:22

我跑了 games201 一位助教同学写的 SPH、WCSPH、PBD 流体模拟的代码，用到了 CUDA，
GPU 是有被利用的

zy00 · 2022 年4 月 4 日 02:24

请问有什么方法可以排查出我这边的问题吗

mzhang · 2022 年4 月 6 日 12:04

你好, diffmpm.py这个例子，使用gui可视化会大大降低GPU使用率，因为这个gui是cpu based的，需要GPU拷贝数据到cpu；可以尝试把376～377行给注释了，看看GPU使用率是否有上升

zy00 · 2022 年4 月 6 日 12:29

你好，感谢你的回复，我尝试了注释367~377行，运行diffmpm之后GPU利用率在输出完

之后突然上升到了70%多，然后出现了一个警告D:\Software\anaconda3\envs\difftaichi\lib\site-packages\taichi\lang\common_ops.py:21: DeprecationWarning: a.atomic_add(b) is deprecated. Please use ti.atomic_add(a, b) instead.
DeprecationWarning)
开始迭代之后GPU利用率又是0%，只有刚运行时闪了一下70%

mzhang · 2022 年4 月 6 日 12:42

这个警告和GPU利用率关系不大，我这边自己测3090 gpu利用率能到65%左右，请问你那边能用nvidia-smi看一下吗

zy00 · 2022 年4 月 6 日 13:19

你好，我用nvidia-smi看了GPU利用率是有到41%

但是具体这个程序占了多少没有显示，请问这就表明了是没有问题的是吗

zy00 · 2022 年4 月 6 日 13:33

你好，我发现我应该是有用到GPU的，可能是任务管理器显示出了问题，我刚才查看了GPU compute部分也是有很高的利用率，感谢您的帮助，麻烦大家了

mzhang · 2022 年4 月 6 日 13:35

没事，解决了就好。猜测是不是因为你电脑还有一块核显，然后任务管理器的占用率可能测的是核显？

zy00 · 2022 年4 月 6 日 13:51

我只有一块独显 GTX980Ti，我刚才也查了一下，没有找到具体的原因