Skip to content

Commit

Permalink
Merge branch 'ztxz16:master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
wildkid1024 authored Jun 14, 2023
2 parents e356a23 + 4d99a41 commit 20c36bd
Show file tree
Hide file tree
Showing 21 changed files with 3,437 additions and 1,732 deletions.
10 changes: 7 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,14 @@ endif()
include_directories(include)

message(STATUS "CMAKE_CXX_FLAGS" ${CMAKE_CXX_FLAGS})
set(FASTLLM_CXX_SOURCES src/fastllm.cpp src/chatglm.cpp src/moss.cpp src/vicuna.cpp)
set(FASTLLM_CXX_SOURCES src/fastllm.cpp src/device.cpp src/devices/cpu/cpudevice.cpp src/executor.cpp src/chatglm.cpp src/moss.cpp src/vicuna.cpp)

if (USE_CUDA)
enable_language(CUDA)
add_compile_definitions(USE_CUDA)
set(FASTLLM_CUDA_SOURCES src/fastllm-cuda.cu)
set(FASTLLM_CUDA_SOURCES src/fastllm-cuda.cu src/devices/cuda/cudadevice.cpp)
set(FASTLLM_LINKED_LIBS ${FASTLLM_LINKED_LIBS} cublas)
#set(CMAKE_CUDA_ARCHITECTURES "70")
endif()

add_library(fastllm OBJECT
Expand All @@ -41,4 +42,7 @@ add_executable(quant tools/quant.cpp)
target_link_libraries(quant fastllm)

add_executable(webui example/webui/webui.cpp)
target_link_libraries(webui fastllm)
target_link_libraries(webui fastllm)

add_executable(benchmark example/benchmark/benchmark.cpp)
target_link_libraries(benchmark fastllm)
171 changes: 171 additions & 0 deletions example/benchmark/benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
//
// Created by huangyuyang on 6/9/23.
//

#include "factoryllm.h"
#include "utils.h"
#include "fstream"

// Model factory plus one lazily-created instance per supported architecture;
// initLLMConf() loads weights into whichever one `modeltype` selects.
static factoryllm fllm;
static int modeltype = 0;   // selected model kind: 0 chatglm, 1 moss, 2 vicuna
static char* modelpath = NULL;   // never read in this file; presumably kept for parity with the chat example
static fastllm::basellm* chatGlm = fllm.createllm(LLM_TYPE_CHATGLM);
static fastllm::basellm* moss = fllm.createllm(LLM_TYPE_MOSS);
static fastllm::basellm* vicuna = fllm.createllm(LLM_TYPE_VICUNA);
static int sRound = 0;      // not referenced in this benchmark
static std::string history; // not referenced in this benchmark

// Maps the --model name accepted on the command line to its numeric id.
std::map <std::string, int> modelDict = {
    {"chatglm", 0}, {"moss", 1}, {"vicuna", 2}
};

// Command-line configuration for the benchmark driver (filled by ParseArgs).
struct BenchmarkConfig {
    int model = LLM_TYPE_CHATGLM;             // model type: 0 chatglm, 1 moss, 2 vicuna
    std::string path = "chatglm-6b-int4.bin"; // path to the model weights file
    int threads = 4;                          // number of inference threads
    int limit = -1;                           // output-token limit; < 0 means unlimited
    int batch = -1;                           // batch size; -1 means use the input file's line count
    std::string file;                         // input file, one prompt per line
    std::string output;                       // output file; results go to stdout if unset
};

// Print command-line help. Keep this list in sync with the flags ParseArgs accepts.
void Usage() {
    std::cout << "Usage:" << std::endl;
    std::cout << "[-h|--help]: 显示帮助" << std::endl;
    std::cout << "<-m|--model> <args>: 模型类型,默认为0, 可以设置为0(chatglm),1(moss),2(vicuna)" << std::endl;
    std::cout << "<-p|--path> <args>: 模型文件的路径" << std::endl;
    std::cout << "<-t|--threads> <args>: 使用的线程数量" << std::endl;
    std::cout << "<-l|--limit> <args>: 输出token数限制" << std::endl;
    std::cout << "<-b|--batch> <args>: batch数" << std::endl;
    std::cout << "<-f|--file> <args>: 输入文件,文件中每行一个prompt,如果行数不足batch则用之前的prompt补充" << std::endl;
    // Fix: ParseArgs accepts -o/--output, but it was missing from the help text.
    std::cout << "<-o|--output> <args>: 输出文件,如果不设定则输出到屏幕" << std::endl;
}

// Parse argv into `config`.
// Exits 0 after printing help for -h/--help; exits -1 on an unknown flag or
// when a value-taking flag appears without its value.
// Fix: the original read sargv[i + 1] / sargv[++i] with no bounds check, so a
// trailing flag (e.g. "./benchmark -p") indexed past the end of the vector.
void ParseArgs(int argc, char **argv, BenchmarkConfig &config) {
    std::vector <std::string> sargv;
    for (int i = 0; i < argc; i++) {
        sargv.push_back(std::string(argv[i]));
    }
    for (int i = 1; i < argc; i++) {
        if (sargv[i] == "-h" || sargv[i] == "--help") {
            Usage();
            exit(0);
        } else if (i + 1 >= argc) {
            // Every remaining recognized flag consumes a value; a trailing
            // flag (recognized or not) is an error either way.
            Usage();
            exit(-1);
        } else if (sargv[i] == "-m" || sargv[i] == "--model") {
            if (modelDict.find(sargv[i + 1]) != modelDict.end()) {
                config.model = modelDict[sargv[++i]];
            } else {
                config.model = atoi(sargv[++i].c_str());
            }
        } else if (sargv[i] == "-p" || sargv[i] == "--path") {
            config.path = sargv[++i];
        } else if (sargv[i] == "-t" || sargv[i] == "--threads") {
            config.threads = atoi(sargv[++i].c_str());
        } else if (sargv[i] == "-l" || sargv[i] == "--limit") {
            config.limit = atoi(sargv[++i].c_str());
        } else if (sargv[i] == "-b" || sargv[i] == "--batch") {
            config.batch = atoi(sargv[++i].c_str());
        } else if (sargv[i] == "-f" || sargv[i] == "--file") {
            config.file = sargv[++i];
        } else if (sargv[i] == "-o" || sargv[i] == "--output") {
            config.output = sargv[++i];
        } else {
            Usage();
            exit(-1);
        }
    }
}

// Configure the thread pool and load the selected model's weights.
// model: 0 chatglm (also warmed up), 1 moss, 2 vicuna. Always returns 0.
int initLLMConf(int model, const char* modelPath, int threads) {
    fastllm::SetThreads(threads);
    modeltype = model;
    switch (modeltype) {
        case 0:
            chatGlm->LoadFromFile(modelPath);
            chatGlm->WarmUp();
            break;
        case 1:
            moss->LoadFromFile(modelPath);
            break;
        case 2:
            vicuna->LoadFromFile(modelPath);
            break;
        default:
            break;
    }
    return 0;
}


// Destroy every model instance and reset the globals.
// `delete` on a null pointer is a no-op, so no guards are needed.
void uninitLLM()
{
    delete chatGlm;
    chatGlm = NULL;
    delete moss;
    moss = NULL;
    delete vicuna;
    vicuna = NULL;
}

// Benchmark driver: load a model, run a batch of prompts through
// ResponseBatch, and report token throughput.
int main(int argc, char **argv) {
    BenchmarkConfig config;
    ParseArgs(argc, argv, config);
    initLLMConf(config.model, config.path.c_str(), config.threads);
    chatGlm->output_token_limit = config.limit;

    // Collect prompts: one per non-empty line of the input file, or a single default.
    std::vector <std::string> inputs;
    if (config.file != "") {
        std::ifstream finputs(config.file, std::ios::in);
        if (!finputs.is_open()) {
            // Fix: the original silently ran with zero prompts if the file was missing.
            printf("cannot open input file: %s\n", config.file.c_str());
            return -1;
        }
        std::string input;
        while (std::getline(finputs, input) && input != "") {
            inputs.push_back(input);
        }
    } else {
        inputs.push_back("Hello!");
    }

    // batch == -1 means "use the input line count". Fix: the original compared
    // inputs.size() (size_t) against batch (int) directly, so batch == -1
    // converted to SIZE_MAX and the pad loop ran until memory exhaustion.
    if (config.batch > 0) {
        while ((int) inputs.size() < config.batch) {
            inputs.push_back(inputs[rand() % inputs.size()]);
        }
        if ((int) inputs.size() > config.batch) {
            inputs.resize(config.batch);
        }
    }

    std::vector <std::string> outputs;
    static int tokens = 0; // static so the captureless callback below can update it
    auto st = std::chrono::system_clock::now();
    chatGlm->ResponseBatch(inputs, outputs, [](int index, std::vector <std::string> &contents) {
        if (index != -1) {
            // Count one token per stream that produced output this step.
            for (size_t i = 0; i < contents.size(); i++) {
                tokens += (contents[i].size() > 0);
            }
        }
    });
    float spend = fastllm::GetSpan(st, std::chrono::system_clock::now());

    // Report each (prompt, reply) pair, to the output file when one was given.
    // Fix: the original fprintf'd through an unchecked fopen result.
    FILE *fo = stdout;
    if (config.output != "") {
        fo = fopen(config.output.c_str(), "w");
        if (fo == NULL) {
            printf("cannot open output file: %s, printing to stdout\n", config.output.c_str());
            fo = stdout;
        }
    }
    for (size_t i = 0; i < outputs.size(); i++) {
        fprintf(fo, "[ user: \"%s\", model: \"%s\"]\n", inputs[i].c_str(), outputs[i].c_str());
    }
    if (fo != stdout) {
        fclose(fo);
    }

    printf("batch: %d\n", (int) inputs.size());
    // Guard the division so an instantaneous run doesn't print inf/nan.
    printf("output %d tokens\nuse %f s\nspeed = %f tokens / s\n",
           tokens, spend, spend > 0 ? tokens / spend : 0.0f);
    return 0;
}
1 change: 1 addition & 0 deletions example/benchmark/prompts.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hello
6 changes: 4 additions & 2 deletions include/basellm.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#include "fastllm.h"

typedef void(*RuntimeResult) (int index, const char* content); //实时生成的内容回调 index: 0开始回复,-1本次回复结束
typedef void(*RuntimeResultBatch) (int index, std::vector <const char*> contents); //实时生成的内容回调 index: 0开始回复,-1本次回复结束
typedef void(*RuntimeResultBatch) (int index, std::vector <std::string> &contents); //实时生成的内容回调 index: 0开始回复,-1本次回复结束

namespace fastllm {
class basellm {
Expand All @@ -22,7 +22,7 @@ namespace fastllm {

virtual void ResponseBatch(const std::vector <std::string> &inputs,
std::vector <std::string> &outputs,
RuntimeResult retCb) {} // 批量根据给出的内容回复
RuntimeResultBatch retCb = nullptr) {} // 批量根据给出的内容回复

virtual void SaveLowBitModel(const std::string &fileName, int bit) {}; // 存储成量化模型

Expand All @@ -32,6 +32,8 @@ namespace fastllm {

virtual void CausalMask(Data &data, int start) {}; // 因果mask

int output_token_limit = -1;

int embed_dim = 4096;
int num_attention_heads = 32;
int head_dim = embed_dim / num_attention_heads;
Expand Down
11 changes: 8 additions & 3 deletions include/chatglm.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,23 @@ namespace fastllm {
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues);

std::vector <int> ForwardBatch(
int batch,
const Data &inputIds,
const Data &attentionMask,
const Data &positionIds,
std::vector <std::pair <Data, Data> > &pastKeyValues);

virtual std::string Response(const std::string& input, RuntimeResult retCb); // 根据给出的内容回复

virtual void ResponseBatch(const std::vector <std::string> &inputs,
std::vector <std::string> &outputs,
RuntimeResult retCb);
RuntimeResultBatch retCb);

virtual void SaveLowBitModel(const std::string &fileName, int bit); // 存储成量化模型

virtual void WarmUp(); // 预热
private:
virtual void RotatePosition2D(Data &data, const Data &positionIds); // 二维位置编码

virtual void CausalMask(Data &data, int start) {}; // 因果mask?
};
}
Expand Down
56 changes: 56 additions & 0 deletions include/device.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
//
// Created by huangyuyang on 6/13/23.
//

#ifndef FASTLLM_DEVICE_H
#define FASTLLM_DEVICE_H

#include "fastllm.h"

namespace fastllm {
    // Named-parameter dictionaries passed to every operator entry point.
    typedef std::map <std::string, Data*> DataDict;
    typedef std::map <std::string, float> FloatDict;
    typedef std::map <std::string, int> IntDict;

    // Base class for a single operator implementation hosted by a device.
    class BaseOperator {
    public:
        // Whether this operator can execute `opType` with the given tensors and parameters.
        virtual bool CanRun(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);

        // Shape inference: derive output shapes for `opType` from the inputs.
        virtual void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);

        // Execute `opType`. Pure virtual: every concrete operator must implement it.
        virtual void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams) = 0;
    };

    // Base class for a compute device (e.g. CPU, CUDA) that owns memory and a
    // table of operators. Op dispatch methods delegate to the entry in `ops`.
    class BaseDevice {
    public:
        virtual bool Malloc (void **ret, size_t size) = 0; // allocate `size` bytes on this device
        virtual bool Malloc (void **ret, Data &data); // allocate space sized for `data`'s dims

        virtual bool Free(void *ret) = 0; // release memory previously returned by Malloc

        virtual bool CopyDataToCPU(void *dst, void *src, size_t size) = 0; // copy device `src` to CPU `dst`
        virtual bool CopyDataToCPU(Data &data); // move `data` from this device to the CPU

        virtual bool CopyDataFromCPU(void *dst, void *src, size_t size) = 0; // copy CPU `src` to device `dst`
        virtual bool CopyDataFromCPU(Data &data); // move `data` from the CPU to this device

        // Whether this device has an operator able to run `opType` with these arguments.
        virtual bool CanRun(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);

        // Shape inference for `opType` on this device.
        virtual void Reshape(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);

        // Execute `opType` on this device.
        virtual void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);

        std::string deviceType;        // device category, e.g. "cpu" or "cuda"
        std::string deviceName;        // human-readable device name
        std::vector <int> deviceIds;   // physical device indices this instance covers

        std::map <std::string, BaseOperator*> ops; // opType -> operator implementation
    };
}

#endif //FASTLLM_DEVICE_H
Loading

0 comments on commit 20c36bd

Please sign in to comment.