forked from ggerganov/llama.cpp
-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
4905cd0
commit f9d41c7
Showing
3 changed files
with
107 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
/** | ||
* Copyright (c) 2023 Nomic, Inc. All rights reserved. | ||
* | ||
* This software is licensed under the terms of the Software for Open Models License (SOM), | ||
* version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany | ||
* this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. | ||
*/ | ||
|
||
#version 450 | ||
|
||
#include "common.comp" | ||
|
||
// The three macros below alias names that are not defined in this file | ||
// (presumably provided by common.comp - TODO confirm). None of them is | ||
// referenced by the rest of this shader as shown. | ||
#define BLOCKS_IN_QUANT QK8_0 | ||
#define SIZE_OF_BLOCK sizeof_block_q8_0 | ||
#define N_ROWS 4 | ||
|
||
// Workgroup shape: the x size is taken from specialization constant 0, | ||
// y and z are fixed at 1. | ||
layout(local_size_x_id = 0) in; | ||
layout(local_size_y = 1) in; | ||
layout(local_size_z = 1) in; | ||
|
||
// inA is addressed as raw bytes (quantized blocks), inB as floats, and | ||
// out_ is a write-only float buffer. | ||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; | ||
layout (binding = 1) readonly buffer tensorInB { float inB[]; }; | ||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; | ||
|
||
// Push constants as used by main(): | ||
//   inAOff - added to byte indices into inA | ||
//   inBOff - added to float indices into inB | ||
//   outOff - added to float indices into out_ | ||
//   ne00   - number of elements accumulated per dot product (loop bound) | ||
//   ne10   - stride, in floats, between consecutive y-workgroup slices of inB | ||
//   ne0    - stride, in floats, between consecutive y-workgroup slices of out_ | ||
//   ne1    - with ne0, stride between consecutive z-workgroup slices of out_ | ||
//   ne01, gqa - not read by this shader as shown | ||
// Member order defines the push-constant layout, so it must not be changed here | ||
// without changing the host side as well. | ||
layout (push_constant) uniform parameter { | ||
uint inAOff; | ||
uint inBOff; | ||
uint outOff; | ||
int ne00; | ||
int ne10; | ||
int ne0; | ||
int ne1; | ||
int ne01; | ||
int gqa; | ||
} pcs; | ||
|
||
// Byte layout of one quantized block as read by main(): a SIZE_OF_D-byte | ||
// scale (decoded with u8BufToFloat16) followed by ELS_PER_BLOCK one-byte | ||
// signed quants, i.e. BLOCK_SIZE bytes per block. | ||
#define ELS_PER_BLOCK 32 | ||
#define SIZE_OF_D 2 | ||
#define BLOCK_SIZE (ELS_PER_BLOCK + SIZE_OF_D) | ||
|
||
// Computes one output element per subgroup: the dot product of one quantized | ||
// row of inA with one float slice of inB. | ||
// | ||
// The x workgroup index selects the inA row, the y workgroup index selects the | ||
// inB slice, and the z workgroup index selects the output plane. Each subgroup | ||
// invocation accumulates every gl_SubgroupSize-th element, the partial sums are | ||
// combined with subgroupAdd, and a single elected invocation stores the result. | ||
// | ||
// NOTE(review): the z workgroup index only contributes to the output index; | ||
// neither input base offset depends on it - confirm against the host dispatch. | ||
void main() { | ||
    const uint rowA = gl_WorkGroupID.x; | ||
    const uint colB = gl_WorkGroupID.y; | ||
    const uint plane = gl_WorkGroupID.z; | ||
|
||
    // Byte index of the first block of this row in inA. | ||
    const uint rowBase = rowA * (pcs.ne00/ELS_PER_BLOCK) * BLOCK_SIZE + pcs.inAOff; | ||
    // Float index of the first element of this slice in inB. | ||
    const uint colBase = colB * pcs.ne10 + pcs.inBOff; | ||
|
||
    float partial = 0.0f; | ||
    uint k = gl_SubgroupInvocationID.x; | ||
    while (k < pcs.ne00) { | ||
        // Start of the block holding element k: scale first, then the quants. | ||
        const uint blockBase = rowBase + (k / ELS_PER_BLOCK) * BLOCK_SIZE; | ||
        const float scale = u8BufToFloat16(inA, blockBase); | ||
        const int quant = int8_t(inA[blockBase + SIZE_OF_D + (k % ELS_PER_BLOCK)]); | ||
        partial += (scale * quant) * float(inB[colBase + k]); | ||
        k += gl_SubgroupSize; | ||
    } | ||
|
||
    const float total = subgroupAdd(partial); | ||
    if (subgroupElect()) { | ||
        const uint dst = plane*pcs.ne1*pcs.ne0 + colB*pcs.ne0 + rowA + pcs.outOff; | ||
        out_[dst] = total; | ||
    } | ||
} |