From 2202c9a51bd1f9f4ed4dd495688207950226169f Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Thu, 2 May 2024 18:09:27 +0000
Subject: [PATCH] add kernel 4 to docs. have to improve these docs more and document them better

---
 dev/cuda/classifier_fused.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dev/cuda/classifier_fused.cu b/dev/cuda/classifier_fused.cu
index f03d2ce59..df6894113 100644
--- a/dev/cuda/classifier_fused.cu
+++ b/dev/cuda/classifier_fused.cu
@@ -10,6 +10,7 @@ nvcc -O3 --use_fast_math classifier_fused.cu -o classifier_fused
 ./classifier_fused 1
 ./classifier_fused 2
 ./classifier_fused 3
+./classifier_fused 4
 */
 
 #include
@@ -448,7 +449,7 @@ __global__ void fused_classifier_kernel4(float* dlogits, float* losses, float* p
     // calculate the probability needed for the loss and update (single-threaded)
     if(threadIdx.x == 0) {
         float prob = expf(logits[idx * P + ix] - sp.Offset) * sp.Scale;
-        losses[idx] = -logf(prob);
+        losses[idx] = -logf(prob);
     }
 
     // very sensible default for dlosses is 1/(B*T), which is the uniform loss