calc loss

GnibChen · Jan 31, 2022 · dac0bb0 · dac0bb0
1 parent 7601435
commit dac0bb0
Show file tree

Hide file tree

Showing 2 changed files with 110 additions and 19 deletions.
diff --git a/09/01_texture/01/CudaArray.cuh b/09/01_texture/01/CudaArray.cuh
@@ -199,3 +199,17 @@ public:
         return {m_impl->m_cuTex};
     }
 };
+
+template <class T>
+struct CudaAST {
+    CudaArray<T> arr;
+    CudaSurface<T> suf;
+    CudaTexture<T> tex;
+
+    CudaAST(ctor_t, typename CudaArray<T>::BuildArgs const &_arrArgs, typename CudaTexture<T>::BuildArgs const &_texArgs = {})
+        : arr(ctor, _arrArgs)
+        , suf(ctor, arr)
+        , tex(ctor, arr, _texArgs)
+    {
+    }
+};
diff --git a/09/01_texture/01/main.cu b/09/01_texture/01/main.cu
@@ -29,19 +29,67 @@ __global__ void resample_kernel(CudaSurface<float4>::Accessor sufLoc, CudaTextur
     sufClrNext.write(clr, x, y, z);
 }
 
-template <class T>
-struct CudaAST {
-    CudaArray<T> arr;
-    CudaSurface<T> suf;
-    CudaTexture<T> tex;
-
-    CudaAST(ctor_t, typename CudaArray<T>::BuildArgs const &_arrArgs, typename CudaTexture<T>::BuildArgs const &_texArgs = {})
-        : arr(ctor, _arrArgs)
-        , suf(ctor, arr)
-        , tex(ctor, arr, _texArgs)
-    {
-    }
-};
+__global__ void divergence_kernel(CudaSurface<float4>::Accessor sufVel, CudaSurface<float>::Accessor sufDiv, unsigned int n) {
+    unsigned int x = threadIdx.x + blockDim.x * blockIdx.x;
+    unsigned int y = threadIdx.y + blockDim.y * blockIdx.y;
+    unsigned int z = threadIdx.z + blockDim.z * blockIdx.z;
+    if (x >= n || y >= n || z >= n) return;
+
+    float vxp = sufVel.read<cudaBoundaryModeClamp>(x + 1, y, z).x;
+    float vxn = sufVel.read<cudaBoundaryModeClamp>(x - 1, y, z).x;
+    float vyp = sufVel.read<cudaBoundaryModeClamp>(x, y + 1, z).y;
+    float vyn = sufVel.read<cudaBoundaryModeClamp>(x, y - 1, z).y;
+    float vzp = sufVel.read<cudaBoundaryModeClamp>(x, y, z + 1).z;
+    float vzn = sufVel.read<cudaBoundaryModeClamp>(x, y, z - 1).z;
+    float div = (vxp - vxn + vyp - vyn + vzp - vzn) * 0.5f;
+    sufDiv.write(div, x, y, z);
+}
+
+__global__ void sumloss_kernel(CudaSurface<float>::Accessor sufDiv, float *sum, unsigned int n) {
+    unsigned int x = threadIdx.x + blockDim.x * blockIdx.x;
+    unsigned int y = threadIdx.y + blockDim.y * blockIdx.y;
+    unsigned int z = threadIdx.z + blockDim.z * blockIdx.z;
+    if (x >= n || y >= n || z >= n) return;
+
+    float div = sufDiv.read(x, y, z);
+    atomicAdd(sum, div * div);
+}
+
+__global__ void jacobi_kernel(CudaSurface<float>::Accessor sufDiv, CudaSurface<float>::Accessor sufPre, CudaSurface<float>::Accessor sufPreNext, unsigned int n) {
+    unsigned int x = threadIdx.x + blockDim.x * blockIdx.x;
+    unsigned int y = threadIdx.y + blockDim.y * blockIdx.y;
+    unsigned int z = threadIdx.z + blockDim.z * blockIdx.z;
+    if (x >= n || y >= n || z >= n) return;
+
+    float pxp = sufPre.read<cudaBoundaryModeClamp>(x + 1, y, z);
+    float pxn = sufPre.read<cudaBoundaryModeClamp>(x - 1, y, z);
+    float pyp = sufPre.read<cudaBoundaryModeClamp>(x, y + 1, z);
+    float pyn = sufPre.read<cudaBoundaryModeClamp>(x, y - 1, z);
+    float pzp = sufPre.read<cudaBoundaryModeClamp>(x, y, z + 1);
+    float pzn = sufPre.read<cudaBoundaryModeClamp>(x, y, z - 1);
+    float div = sufDiv.read(x, y, z);
+    float preNext = (pxp + pxn + pyp + pyn + pzp + pzn - div) * (1.f / 6.f);
+    sufPreNext.write(preNext, x, y, z);
+}
+
+__global__ void subgradient_kernel(CudaSurface<float>::Accessor sufPre, CudaSurface<float4>::Accessor sufVel, unsigned int n) {
+    unsigned int x = threadIdx.x + blockDim.x * blockIdx.x;
+    unsigned int y = threadIdx.y + blockDim.y * blockIdx.y;
+    unsigned int z = threadIdx.z + blockDim.z * blockIdx.z;
+    if (x >= n || y >= n || z >= n) return;
+
+    float pxp = sufPre.read<cudaBoundaryModeClamp>(x + 1, y, z);
+    float pxn = sufPre.read<cudaBoundaryModeClamp>(x - 1, y, z);
+    float pyp = sufPre.read<cudaBoundaryModeClamp>(x, y + 1, z);
+    float pyn = sufPre.read<cudaBoundaryModeClamp>(x, y - 1, z);
+    float pzp = sufPre.read<cudaBoundaryModeClamp>(x, y, z + 1);
+    float pzn = sufPre.read<cudaBoundaryModeClamp>(x, y, z - 1);
+    float4 vel = sufVel.read(x, y, z);
+    vel.x -= 0.5f * (pxp - pxn);
+    vel.y -= 0.5f * (pyp - pyn);
+    vel.z -= 0.5f * (pzp - pzn);
+    sufVel.write(vel, x, y, z);
+}
 
 struct SmokeSim {
     nocopy_t nocopy;
@@ -52,14 +100,20 @@ struct SmokeSim {
     CudaAST<float4> velNext;
     CudaAST<float4> clr;
     CudaAST<float4> clrNext;
+    CudaAST<float> div;
+    CudaAST<float> pre;
+    CudaAST<float> preNext;
 
-    SmokeSim(unsigned int _n)
+    SmokeSim(ctor_t, unsigned int _n)
     : n(_n)
     , loc(ctor, {{n, n, n}})
     , vel(ctor, {{n, n, n}})
     , velNext(ctor, {{n, n, n}})
     , clr(ctor, {{n, n, n}})
     , clrNext(ctor, {{n, n, n}})
+    , div(ctor, {{n, n, n}})
+    , pre(ctor, {{n, n, n}})
+    , preNext(ctor, {{n, n, n}})
     {}
 
     void advection() {
@@ -70,19 +124,40 @@ struct SmokeSim {
         std::swap(vel, velNext);
         std::swap(clr, clrNext);
     }
+
+    void projection(int times = 400) {
+        divergence_kernel<<<dim3((n + 7) / 8, (n + 7) / 8, (n + 7) / 8), dim3(8, 8, 8)>>>(vel.suf.access(), div.suf.access(), n);
+
+        for (int step = 0; step < times; step++) {
+            jacobi_kernel<<<dim3((n + 7) / 8, (n + 7) / 8, (n + 7) / 8), dim3(8, 8, 8)>>>(div.suf.access(), pre.suf.access(), preNext.suf.access(), n);
+            std::swap(pre, preNext);
+        }
+
+        subgradient_kernel<<<dim3((n + 7) / 8, (n + 7) / 8, (n + 7) / 8), dim3(8, 8, 8)>>>(pre.suf.access(), vel.suf.access(), n);
+    }
+
+    float calc_loss() {
+        divergence_kernel<<<dim3((n + 7) / 8, (n + 7) / 8, (n + 7) / 8), dim3(8, 8, 8)>>>(vel.suf.access(), div.suf.access(), n);
+        float *sum;
+        checkCudaErrors(cudaMalloc(&sum, sizeof(float)));
+        sumloss_kernel<<<dim3((n + 7) / 8, (n + 7) / 8, (n + 7) / 8), dim3(8, 8, 8)>>>(div.suf.access(), sum, n);
+        float cpu;
+        checkCudaErrors(cudaMemcpy(&cpu, sum, sizeof(float), cudaMemcpyDeviceToHost));
+        checkCudaErrors(cudaFree(sum));
+        return cpu;
+    }
 };
 
 int main() {
     unsigned int n = 64;
-
-    SmokeSim sim(n);
+    SmokeSim sim(ctor, n);
 
     {
         std::vector<float4> cpu(n * n * n);
         for (unsigned int z = 0; z < n; z++) {
             for (unsigned int y = 0; y < n; y++) {
                 for (unsigned int x = 0; x < n; x++) {
-                    float den = std::hypot((int)x - (int)n / 2, (int)y - (int)n / 2, (int)z - (int)n / 2) < n / 2 ? 1.f : 0.f;
+                    float den = std::hypot((int)x - (int)n / 2, (int)y - (int)n / 2, (int)z - (int)n / 2) < n / 3 ? 1.f : 0.f;
                     cpu[x + n * (y + n * z)] = make_float4(den, 0.f, 0.f, 0.f);
                 }
             }
@@ -95,7 +170,8 @@ int main() {
         for (unsigned int z = 0; z < n; z++) {
             for (unsigned int y = 0; y < n; y++) {
                 for (unsigned int x = 0; x < n; x++) {
-                    cpu[x + n * (y + n * z)] = make_float4(0.1f, 0.f, 0.f, 0.f);
+                    float vel = std::hypot((int)x - (int)n / 2, (int)y - (int)n / 2, (int)z - (int)n / 2) < n / 3 ? 0.5f : 0.f;
+                    cpu[x + n * (y + n * z)] = make_float4(vel, 0.f, 0.f, 0.f);
                 }
             }
         }
@@ -107,8 +183,9 @@ int main() {
         sim.clr.arr.copyOut(cpu.data());
         writevdb<float, 1>("/tmp/a" + std::to_string(1000 + frame).substr(1) + ".vdb", cpu.data(), n, n, n, sizeof(float4));
 
-        printf("frame=%d\n", frame);
+        printf("frame=%d, loss=%f\n", frame, sim.calc_loss());
         sim.advection();
+        sim.projection();
     }
 
     return 0;