diff --git a/.gitignore b/.gitignore
index dd641ed..e8689cf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,6 @@
 build/
 
 # MacOS Cache
 .DS_Store
-vscode/
+.vscode/
 models/
diff --git a/include/modules/llamaAttention.h b/include/modules/llamaAttention_fp32.h
similarity index 100%
rename from include/modules/llamaAttention.h
rename to include/modules/llamaAttention_fp32.h
diff --git a/include/modules/llamaAttention_int4.h b/include/modules/llamaAttention_int4.h
new file mode 100644
index 0000000..c8c6501
--- /dev/null
+++ b/include/modules/llamaAttention_int4.h
@@ -0,0 +1,51 @@
+#include <iostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "common.h"
+#include "operators.h"
+#include "utlis.h"
+
+struct Int4llamaAttention_output {
+    Matrix3D<float> attn_output;
+    Matrix3D<float> attn_probs_reshaped;
+    std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value;
+};
+struct Int4llamaAttention_input {
+    Matrix3D<float> hidden_states;
+    Matrix3D<float> attention_mask;
+    Matrix3D<float> past_key, past_value;
+    bool has_past_key_value = false;
+    int layer_idx;
+
+    Int4llamaAttention_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_, int layer_idx_)
+        : hidden_states(hidden_states_), attention_mask(attention_mask_), layer_idx(layer_idx_) {}
+
+    Int4llamaAttention_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_, Matrix3D<float> past_key_,
+                             Matrix3D<float> past_value_, bool has_past_key_value_, int layer_idx_)
+        : hidden_states(hidden_states_),
+          attention_mask(attention_mask_),
+          past_key(past_key_),
+          past_value(past_value_),
+          has_past_key_value(has_past_key_value_),
+          layer_idx(layer_idx_) {}
+};
+
+
+class Int4llamaAttention {
+   public:
+    Int4llamaAttention(std::string param_path, const struct model_config config);
+    Int4llamaAttention() {}
+    // static void initialized_memory(const struct model_config config);
+    // struct Int4llamaAttention_output forward(const struct Int4llamaAttention_input &input);
+
+   private:
+    void unshape(Matrix3D<float> shaped, Matrix3D<float> unshape, int sqlen);
+    void shape(Matrix3D<float> unshape, Matrix3D<float> shaped, int sqlen);
+    int embed_dim, num_heads, head_dim;
+    // Linear_FP_int4 k_proj, v_proj, q_proj, o_proj;
+    // RotaryPosEmb rotary_pos_emb;
+    // BMM_F32T qk_bmm, pv_bmm;
+    std::string profile_name = "Int4llamaAttention";
+};
diff --git a/include/modules/llamaDecoder.h b/include/modules/llamaDecoder_fp32.h
similarity index 90%
rename from include/modules/llamaDecoder.h
rename to include/modules/llamaDecoder_fp32.h
index 2673e1c..190425a 100644
--- a/include/modules/llamaDecoder.h
+++ b/include/modules/llamaDecoder_fp32.h
@@ -1,10 +1,10 @@
-// #include <cstdlib>
-// #include <string>
-// #include <vector>
+#include <cstdlib>
+#include <string>
+#include <vector>
 
-// #include "llamaDecoderLayer.h"
-// #include "common.h"
-// #include "operators.h"
+#include "llamaDecoderLayer_fp32.h"
+#include "common.h"
+#include "operators.h"
 
 // struct Fp32llamaDecoder_output {
 //     Matrix3D<float> last_hidden_state;
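
The Int4llamaAttention_input struct above encodes the KV-cache state in its two constructors: one for the prefill step (no cache yet) and one for decode steps that carry past_key/past_value. A minimal sketch of that pattern, using a hypothetical Matrix3D stand-in (the real class lives in common.h and is not shown in this diff):

#include <iostream>

template <typename T>
struct Matrix3D {  // stand-in: the real class wraps a T* with (dim_x, dim_y, dim_z)
    T *data = nullptr;
    int dim_x = 0, dim_y = 0, dim_z = 0;
    Matrix3D() = default;
    Matrix3D(T *data_, int x, int y, int z) : data(data_), dim_x(x), dim_y(y), dim_z(z) {}
};

struct AttentionInput {
    Matrix3D<float> hidden_states, attention_mask, past_key, past_value;
    bool has_past_key_value = false;
    int layer_idx;
    // Prefill: no cache exists yet, so has_past_key_value stays false.
    AttentionInput(Matrix3D<float> h, Matrix3D<float> m, int idx)
        : hidden_states(h), attention_mask(m), layer_idx(idx) {}
    // Decode: reuse cached K/V so each step only processes one new token.
    AttentionInput(Matrix3D<float> h, Matrix3D<float> m, Matrix3D<float> pk, Matrix3D<float> pv, int idx)
        : hidden_states(h), attention_mask(m), past_key(pk), past_value(pv),
          has_past_key_value(true), layer_idx(idx) {}
};

int main() {
    float h[8] = {0}, m[4] = {0};
    AttentionInput prefill(Matrix3D<float>(h, 1, 2, 4), Matrix3D<float>(m, 1, 2, 2), /*layer_idx=*/0);
    std::cout << "has_past_key_value = " << prefill.has_past_key_value << "\n";  // prints 0
}
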
diff --git a/include/modules/llamaDecoder_int4.h b/include/modules/llamaDecoder_int4.h
new file mode 100644
index 0000000..5929347
--- /dev/null
+++ b/include/modules/llamaDecoder_int4.h
@@ -0,0 +1,37 @@
+#include "llamaDecoderlayer_int4.h"
+
+struct Int4llamaDecoder_output {
+    Matrix3D<float> last_hidden_state;
+    std::vector<Matrix3D<float>> past_keys, past_values;
+};
+struct Int4llamaDecoder_input {
+    Matrix3D<int> input_ids;
+    std::vector<Matrix3D<float>> past_keys, past_values;
+    bool has_past_keys_values;
+
+    Int4llamaDecoder_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; }
+    Int4llamaDecoder_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
+                           std::vector<Matrix3D<float>> past_values_)
+        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
+        has_past_keys_values = true;
+    }
+};
+
+class Int4llamaDecoder {
+   public:
+    Int4llamaDecoder(std::string param_path, const struct model_config config);
+    Int4llamaDecoder(){};
+    // Matrix3D<float> prepare_decoder_attention_mask(int length, int past_length);
+    // struct Int4llamaDecoder_output forward(const struct Int4llamaDecoder_input& input);
+    // Embedding embed_tokens;
+    // LlamaRMSNorm norm;
+    int voc_size, embed_dim, padding_idx, hidden_dim, num_heads;
+    std::vector<Int4llamaDecoderLayer> layers;
+    std::string profile_name = "Int4llamaDecoder";
+
+   private:
+    float* attention_mask_buf;
+    float* pos_embeds_buf;
+    float* last_hidden_states_buf;
+    float* hidden_states_buf;
+};
diff --git a/include/modules/llamaDecoderlayer.h b/include/modules/llamaDecoderlayer_fp32.h
similarity index 95%
rename from include/modules/llamaDecoderlayer.h
rename to include/modules/llamaDecoderlayer_fp32.h
index e4b6d84..30549ca 100644
--- a/include/modules/llamaDecoderlayer.h
+++ b/include/modules/llamaDecoderlayer_fp32.h
@@ -1,6 +1,5 @@
-// #include "llamaAttention.h"
-// #include "common.h"
-// #include "operators.h"
+#include "llamaAttention_fp32.h"
+
 
 
 // struct Fp32llamaDecoderLayer_output {
diff --git a/include/modules/llamaDecoderlayer_int4.h b/include/modules/llamaDecoderlayer_int4.h
new file mode 100644
index 0000000..37c535f
--- /dev/null
+++ b/include/modules/llamaDecoderlayer_int4.h
@@ -0,0 +1,51 @@
+#include "llamaAttention_int4.h"
+
+struct Int4llamaDecoderLayer_output
+{
+    Matrix3D<float> hidden_states;
+    Matrix3D<float> attentions;
+    std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value;
+
+    Int4llamaDecoderLayer_output(Matrix3D<float> hidden_states_, Matrix3D<float> attentions_,
+                                 std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value_)
+    {
+        hidden_states = hidden_states_;
+        attentions = attentions_;
+        past_key_value = past_key_value_;
+    };
+};
+struct Int4llamaDecoderLayer_input
+{
+    Matrix3D<float> hidden_states;
+    Matrix3D<float> attention_mask;
+    Matrix3D<float> past_key, past_value;
+    bool has_past_key_value = false;
+
+    Int4llamaDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_)
+    {
+        hidden_states = hidden_states_;
+        attention_mask = attention_mask_;
+        has_past_key_value = false;
+    }
+    Int4llamaDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_,
+                                Matrix3D<float> past_key_, Matrix3D<float> past_value_)
+    {
+        hidden_states = hidden_states_;
+        attention_mask = attention_mask_;
+        past_key = past_key_;
+        past_value = past_value_;
+        has_past_key_value = true;
+    }
+};
+
+class Int4llamaDecoderLayer {
+   public:
+    Int4llamaDecoderLayer(std::string param_path, const struct model_config config, int layer_idx);
+    struct Int4llamaDecoderLayer_output forward(const struct Int4llamaDecoderLayer_input &input);
+
+    int embed_dim, num_attention_heads, hidden_dim, layer_idx;
+    // LlamaRMSNorm input_layernorm, post_attention_layernorm;  // from torch_int.nn
+    // Linear_FP_int4 gate_proj, down_proj, up_proj;
+    Int4llamaAttention attn;
+    std::string profile_name = "Int4llamaDecoderLayer";
+};
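
Int4llamaDecoder_output keeps one cached key matrix and one cached value matrix per decoder layer, matching the std::pair each Int4llamaDecoderLayer_output returns: layer i's pair lands at index i of past_keys/past_values. A small sketch of that bookkeeping, with simplified stand-in types instead of Matrix3D:

#include <iostream>
#include <utility>
#include <vector>

// (key, value) stand-in for std::pair<Matrix3D<float>, Matrix3D<float>>
using KV = std::pair<std::vector<float>, std::vector<float>>;

int main() {
    int num_layers = 4;
    std::vector<std::vector<float>> past_keys, past_values;
    for (int i = 0; i < num_layers; i++) {
        // What a decoder forward() would do: run layer i, then stash its pair.
        KV layer_out = {std::vector<float>(8, float(i)), std::vector<float>(8, -float(i))};
        past_keys.push_back(layer_out.first);
        past_values.push_back(layer_out.second);
    }
    std::cout << "cached layers: " << past_keys.size() << "\n";  // 4
}
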
diff --git a/include/modules/llamaForCausalLM.h b/include/modules/llamaForCausalLM_fp32.h
similarity index 50%
rename from include/modules/llamaForCausalLM.h
rename to include/modules/llamaForCausalLM_fp32.h
index 76e968b..693140e 100644
--- a/include/modules/llamaForCausalLM.h
+++ b/include/modules/llamaForCausalLM_fp32.h
@@ -1,4 +1,4 @@
-// #include "llamaDecoder.h"
+ #include "llamaDecoder_fp32.h"
 
 // struct Fp32LlamaForCausalLM_output {
 //     Matrix3D<float> logits;
@@ -31,3 +31,35 @@
 //     float* logits_output;
 //     float* lm_head_weight;
 // };
+
+
+struct Int4LlamaForCausalLM_output {
+    Matrix3D<float> logits;
+    std::vector<Matrix3D<float>> past_keys, past_values;
+};
+struct Int4LlamaForCausalLM_input {
+    Matrix3D<int> input_ids;
+    std::vector<Matrix3D<float>> past_keys, past_values;
+    bool has_past_keys_values;
+
+    Int4LlamaForCausalLM_input() {}
+    Int4LlamaForCausalLM_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; }
+    Int4LlamaForCausalLM_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
+                               std::vector<Matrix3D<float>> past_values_)
+        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
+        has_past_keys_values = true;
+    }
+};
+
+class Int4LlamaForCausalLM {
+   public:
+    Int4LlamaForCausalLM(std::string param_path, const struct model_config config);
+    struct Int4LlamaForCausalLM_output forward(const struct Int4LlamaForCausalLM_input& input);
+
+   private:
+    // Int4llamaDecoder decoder;
+    // Linear_FP_int4 lm_head;
+    std::string profile_name = "Int4LlamaForCausalLM";
+    float* logits_output;
+    uint8_t* lm_head_weight;
+};
diff --git a/include/modules/llamaForCausalLM_int4.h b/include/modules/llamaForCausalLM_int4.h
new file mode 100644
index 0000000..3f0a51f
--- /dev/null
+++ b/include/modules/llamaForCausalLM_int4.h
@@ -0,0 +1,32 @@
+#include "llamaDecoder_int4.h"
+
+struct Int4LlamaForCausalLM_output {
+    Matrix3D<float> logits;
+    std::vector<Matrix3D<float>> past_keys, past_values;
+};
+struct Int4LlamaForCausalLM_input {
+    Matrix3D<int> input_ids;
+    std::vector<Matrix3D<float>> past_keys, past_values;
+    bool has_past_keys_values;
+
+    Int4LlamaForCausalLM_input() {}
+    Int4LlamaForCausalLM_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; }
+    Int4LlamaForCausalLM_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
+                               std::vector<Matrix3D<float>> past_values_)
+        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
+        has_past_keys_values = true;
+    }
+};
+
+class Int4LlamaForCausalLM {
+   public:
+    Int4LlamaForCausalLM(std::string param_path, const struct model_config config);
+    struct Int4LlamaForCausalLM_output forward(const struct Int4LlamaForCausalLM_input& input);
+
+   private:
+    Int4llamaDecoder decoder;
+    // Linear_FP_int4 lm_head;
+    std::string profile_name = "Int4LlamaForCausalLM";
+    float* logits_output;
+    uint8_t* lm_head_weight;
+};
\ No newline at end of file
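
The Int4LlamaForCausalLM_input constructors suggest the usual autoregressive loop: call forward() once on the whole prompt with no cache, then once per generated token together with the returned past_keys/past_values. A toy sketch of that calling pattern only; fake_forward and its types are placeholders, since the real forward() is still commented out above:

#include <iostream>
#include <vector>

struct Output {
    int next_token;          // stand-in for argmax over the logits
    std::vector<int> past;   // stand-in for past_keys/past_values
};

Output fake_forward(const std::vector<int> &ids, std::vector<int> cache) {
    cache.insert(cache.end(), ids.begin(), ids.end());  // pretend to extend the KV cache
    return {int(cache.size()) % 100, cache};            // pretend to pick the next token
}

int main() {
    std::vector<int> prompt = {1, 15043, 29892};
    Output out = fake_forward(prompt, {});              // prefill: whole prompt, empty cache
    for (int step = 0; step < 3; step++)                // decode: one token + cache per step
        out = fake_forward({out.next_token}, out.past);
    std::cout << "cache holds " << out.past.size() << " entries\n";  // 6
}
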
diff --git a/src/main.cpp b/src/main.cpp
index 082be94..7b8fd48 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -3,7 +3,7 @@
 #include "model.h"
 #include "utlis.h"
 #include "opt_params.h"
-#include "llamaAttention.h"
+#include "llamaForCausalLM_int4.h"
 
 std::map<std::string, int> model_config = {{"OPT_125m", OPT_125M}, {"OPT_1.3B", OPT_1_3B}, {"OPT_6.7B", OPT_6_7B},
                                            {"LLaMA_7B", LLaMA_7B}, {"LLaMA_7B_AWQ", LLaMA_7B}, {"LLaMA_7B_2_chat", LLaMA_7B}};
@@ -105,7 +105,7 @@
     case FP32: {
         std::cout << m_path << std::endl;
-        Fp32llamaAttention a = Fp32llamaAttention(m_path, get_opt_model_config(model_id));
+        // Fp32llamaAttention a = Fp32llamaAttention(m_path, get_opt_model_config(model_id));
         // Fp32LlamaForCausalLM model = Fp32LlamaForCausalLM(m_path, get_opt_model_config(model_id));
 
         std::cout << "Finished!" << std::endl;
 
@@ -124,7 +124,7 @@
     case INT4: {
         m_path = "../models/INT4/" + m_path;
-        Fp32llamaAttention a = Fp32llamaAttention(m_path, get_opt_model_config(model_id));
+        Int4LlamaForCausalLM model = Int4LlamaForCausalLM(m_path, get_opt_model_config(model_id));
         std::cout << "Finished!" << std::endl;
 
 
         // Get input from the user
diff --git a/src/modules/llamaAttention.cpp b/src/modules/llamaAttention_fp32.cpp
similarity index 98%
rename from src/modules/llamaAttention.cpp
rename to src/modules/llamaAttention_fp32.cpp
index f6a3581..5ed73ec 100644
--- a/src/modules/llamaAttention.cpp
+++ b/src/modules/llamaAttention_fp32.cpp
@@ -1,4 +1,4 @@
-#include"llamaAttention.h"
+#include"llamaAttention_fp32.h"
 
 
 
diff --git a/src/modules/llamaAttention_int4.cpp b/src/modules/llamaAttention_int4.cpp
new file mode 100644
index 0000000..fd79663
--- /dev/null
+++ b/src/modules/llamaAttention_int4.cpp
@@ -0,0 +1,14 @@
+#include "llamaAttention_int4.h"
+
+Int4llamaAttention::Int4llamaAttention(std::string param_path, const struct model_config config)
+{
+    std::cout << param_path << std::endl;
+    uint8_t *q_weight, *k_weight, *v_weight, *o_weight;
+    allocate_aligned_memory(q_weight, (config.embed_dim * config.embed_dim * sizeof(uint8_t)) / 2);
+    allocate_aligned_memory(k_weight, (config.embed_dim * config.embed_dim * sizeof(uint8_t)) / 2);
+    allocate_aligned_memory(v_weight, (config.embed_dim * config.embed_dim * sizeof(uint8_t)) / 2);
+    allocate_aligned_memory(o_weight, (config.embed_dim * config.embed_dim * sizeof(uint8_t)) / 2);
+    // this->q_proj =
+    //     Linear_FP_int4(Matrix3D<uint8_t>(q_weight, 1, config.embed_dim, config.embed_dim / 2), param_path + "/q_proj");
+    std::cout << "Allocated memory" << std::endl;
+}
\ No newline at end of file
diff --git a/src/modules/llamaDecoder.cpp b/src/modules/llamaDecoder_fp32.cpp
similarity index 100%
rename from src/modules/llamaDecoder.cpp
rename to src/modules/llamaDecoder_fp32.cpp
diff --git a/src/modules/llamaDecoder_int4.cpp b/src/modules/llamaDecoder_int4.cpp
new file mode 100644
index 0000000..b4c922e
--- /dev/null
+++ b/src/modules/llamaDecoder_int4.cpp
@@ -0,0 +1,19 @@
+#include "llamaDecoder_int4.h"
+
+Int4llamaDecoder::Int4llamaDecoder(std::string param_path, const struct model_config config)
+{
+    allocate_aligned_memory(attention_mask_buf, config.max_sqlen * config.max_sqlen * sizeof(float));
+    allocate_aligned_memory(pos_embeds_buf, config.max_sqlen * config.embed_dim * sizeof(float));
+    allocate_aligned_memory(last_hidden_states_buf, config.max_sqlen * config.embed_dim * sizeof(float));
+    allocate_aligned_memory(hidden_states_buf, config.max_sqlen * config.embed_dim * sizeof(float));
+
+    for (int layer_idx = 0; layer_idx < config.num_layers; layer_idx++) {
+        DEBUG_INS(std::cout << "Start loading layer:" << layer_idx << "..." << std::endl;)
+
+        std::string path = param_path + "/layer" + std::to_string(layer_idx);
+        Int4llamaDecoderLayer layer = Int4llamaDecoderLayer(path, config, layer_idx);
+
+        this->layers.push_back(layer);
+    }
+    std::cout << "Int4llamaDecoder init finished!" << std::endl;
+}
\ No newline at end of file
diff --git a/src/modules/llamaDecoderlayer.cpp b/src/modules/llamaDecoderlayer_fp32.cpp
similarity index 100%
rename from src/modules/llamaDecoderlayer.cpp
rename to src/modules/llamaDecoderlayer_fp32.cpp
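
The /2 in the allocations above is the INT4 packing factor: two 4-bit weights share one uint8_t, so an embed_dim x embed_dim projection needs embed_dim * embed_dim / 2 bytes. A self-contained sketch of that arithmetic; the actual layout Linear_FP_int4 expects may differ (e.g. interleaved for SIMD):

#include <cstdint>
#include <cstdio>

// Pack n 4-bit values (n even) into n/2 bytes: even index -> low nibble,
// odd index -> high nibble.
void pack_int4(const uint8_t *vals, uint8_t *packed, int n) {
    for (int i = 0; i < n; i += 2)
        packed[i / 2] = (vals[i] & 0x0F) | (uint8_t)((vals[i + 1] & 0x0F) << 4);
}

int main() {
    uint8_t vals[4] = {3, 12, 7, 1};
    uint8_t packed[2];
    pack_int4(vals, packed, 4);
    printf("%d %d\n", packed[0] & 0x0F, packed[0] >> 4);        // 3 12
    printf("4096x4096 weights -> %d bytes\n", 4096 * 4096 / 2);  // 8388608
}
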
diff --git a/src/modules/llamaDecoderlayer_int4.cpp b/src/modules/llamaDecoderlayer_int4.cpp
new file mode 100644
index 0000000..8fa1b91
--- /dev/null
+++ b/src/modules/llamaDecoderlayer_int4.cpp
@@ -0,0 +1,8 @@
+#include "llamaDecoderlayer_int4.h"
+
+
+Int4llamaDecoderLayer::Int4llamaDecoderLayer(std::string param_path, const struct model_config config, int layer_idx) {
+
+    this->attn = Int4llamaAttention(param_path + "/self_attn", config);
+    std::cout << "Int4llamaDecoderLayer init finished! Layer index: " << layer_idx << std::endl;
+}
\ No newline at end of file
diff --git a/src/modules/llamaForCausalLM.cpp b/src/modules/llamaForCausalLM_fp32.cpp
similarity index 100%
rename from src/modules/llamaForCausalLM.cpp
rename to src/modules/llamaForCausalLM_fp32.cpp
diff --git a/src/modules/llamaForCausalLM_int4.cpp b/src/modules/llamaForCausalLM_int4.cpp
new file mode 100644
index 0000000..1e92380
--- /dev/null
+++ b/src/modules/llamaForCausalLM_int4.cpp
@@ -0,0 +1,11 @@
+#include "llamaForCausalLM_int4.h"
+
+Int4LlamaForCausalLM::Int4LlamaForCausalLM(std::string param_path, const struct model_config config) {
+    allocate_aligned_memory(logits_output, config.max_sqlen * config.vocsize * sizeof(float));
+    allocate_aligned_memory(lm_head_weight, (config.embed_dim * config.vocsize * sizeof(uint8_t)) / 2);
+
+    this->decoder = Int4llamaDecoder(param_path + "/decoder", config);
+    // this->lm_head = Linear_FP_int4(Matrix3D<uint8_t>(lm_head_weight, 1, config.vocsize, config.embed_dim / 2),
+    //                                param_path + "/lm_head");
+    std::cout << "Int4LlamaForCausalLM init finished!" << std::endl;
+}
\ No newline at end of file
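
allocate_aligned_memory is declared in utlis.h and never shown in this diff. A plausible implementation consistent with the call sites above, offered only as an assumption about what the helper does: take the pointer by reference and over-align the buffer for SIMD loads.

#include <cstdint>
#include <cstdio>
#include <cstdlib>

template <typename T>
void allocate_aligned_memory(T *&ptr, size_t size_in_bytes) {
    constexpr size_t alignment = 64;  // assumed cache-line / AVX-friendly alignment
    // aligned_alloc requires the size to be a multiple of the alignment.
    size_t rounded = (size_in_bytes + alignment - 1) / alignment * alignment;
    ptr = static_cast<T *>(aligned_alloc(alignment, rounded));
    if (!ptr) { perror("aligned_alloc"); exit(1); }
}

int main() {
    float *buf;
    allocate_aligned_memory(buf, 2048 * 4096 * sizeof(float));  // e.g. max_sqlen * embed_dim
    printf("64-byte aligned: %d\n", (reinterpret_cast<uintptr_t>(buf) % 64) == 0);  // 1
    free(buf);
}
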