# 使用稀疏数据结构加速B^T@A@B

```python
import taichi as ti
ti.init(arch=ti.cpu, kernel_profiler=True)

n = 500  # A is n x n
m = 200  # B is n x m
A = ti.field(ti.f32)
B = ti.field(ti.f32)
A_bm = ti.root.bitmasked(ti.ij, n)       # sparse bitmasked layout for A: (n x n)
A_bm.place(A)
B_bm = ti.root.bitmasked(ti.ij, (n, m))  # sparse bitmasked layout for B: (n x m)
B_bm.place(B)
result = ti.field(ti.f32, shape=(m, m))  # dense output: (m x m)

# Activate only the diagonals: A = I_n, B = [I_m; 0], so B^T @ A @ B = I_m.
for i in range(n):
    A[i, i] = 1.

for i in range(m):
    B[i, i] = 1.


@ti.kernel
def calculate():
    """Compute result = B^T @ A @ B, skipping inactive (zero) sparse cells."""
    for i, j in ti.ndrange(m, m):
        result[i, j] = 0.
        for t in range(n):
            s = 0.
            for k in range(n):
                # BUG FIX: guard exactly the cells that are read below,
                # A[t, k] and B[k, j]. The original tested A_bm at [k, j]
                # and B_bm at [t, k] — the wrong entries; that only
                # happened to give the right answer for this identity-
                # matrix demo data, not for general sparse A, B.
                if ti.is_active(A_bm, [t, k]) and ti.is_active(B_bm, [k, j]):
                    s += A[t, k] * B[k, j]
            # (B^T)[i, t] == B[t, i]; inactive bitmasked reads yield 0.
            s = s * B[t, i]
            result[i, j] += s


calculate()
ti.print_kernel_profile_info()
```

1 Like

Hi,

1. `block_shared_memory`（CUDA 上的 shared memory），可以看看这里
2. 调整一下 `ti.block_dim`（CUDA 上每个 block 的线程数），也可能有提升
3. 调整一下 `A, B` 的 SNode 结构，如何合理地访问内存也很关键
   1. 如果 `A, B` 不大的话，甚至可以直接粗暴地用 `ti.dense`，有时候反而比 sparse 来得快
4. 用 async_mode

1 Like

（另外补充一下，稀疏线性代数和matrix-free的技术计算是两个逻辑，二者的稀疏性和计算模式都有很大不同。）

1 Like

1 Like

```python
import numpy as np
x = 10  # row count of K and I
y = 5   # column count of I

K = np.random.rand(10, 10)
I = np.random.rand(10, 5)

# Reference result via dense matrix products: A0 = I^T @ K @ I.
A0 = I.transpose() @ K @ I

# Same product accumulated entry by entry, skipping zero entries of I.
A1 = np.zeros((y, y))
for r in range(x):
    for c in range(y):  # with Taichi, replace the two outer loops by a struct-for
        if I[r, c] != 0.:
            for p in range(x):
                for q in range(y):
                    if I[p, q] != 0.:
                        A1[c, q] += I[r, c] * K[r, p] * I[p, q]

print(np.linalg.norm(A1 - A0))
```