ggml : sync with llama.cpp
- int64_t number of elements
- remove mlock
- expose quantization functions
- expose ggml_object
- add ggml_view_3d()
- multi-thread ggml_rope()
- fix ggml_cpy()
- add ggml_init_params.no_alloc
- fix ggml_mul_mat() backward
ggerganov committed Apr 10, 2023
1 parent 42cbb07 commit 3ac8072
Showing 18 changed files with 1,086 additions and 716 deletions.
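
Before the per-file diffs, a minimal sketch (not part of the commit) of two of the API changes called out in the message: the new no_alloc field of ggml_init_params and ggml_view_3d(). The ggml_view_3d() argument order (sizes ne0..ne2, then strides nb1 and nb2, then a byte offset) is assumed from the ggml headers of this era:

#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        .mem_size   = 16*1024*1024, // 16 MB scratch buffer
        .mem_buffer = NULL,         // let ggml allocate the buffer itself
        .no_alloc   = false,        // true would create tensor metadata only,
                                    // without reserving memory for the data
    };

    struct ggml_context * ctx = ggml_init(params);

    // tensor dimensions are now int64_t ("int64_t number of elements")
    struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 4, 3, 2);

    // view of the first 2 rows of each 4x3 plane; strides come from the
    // parent tensor and the offset is in bytes (argument layout assumed)
    struct ggml_tensor * v = ggml_view_3d(ctx, a, 4, 2, 2, a->nb[1], a->nb[2], 0);
    (void) v;

    ggml_free(ctx);
    return 0;
}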
12 changes: 8 additions & 4 deletions examples/gpt-2/main.cpp
@@ -199,6 +199,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
     struct ggml_init_params params = {
         .mem_size   = ctx_size,
         .mem_buffer = NULL,
+        .no_alloc   = false,
     };
 
     model.ctx = ggml_init(params);
@@ -315,9 +316,11 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
         }
 
         int32_t nelements = 1;
-        int32_t ne[2] = { 1, 1 };
+        int64_t ne[2] = { 1, 1 };
         for (int i = 0; i < n_dims; ++i) {
-            fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+            int32_t ne_cur;
+            fin.read(reinterpret_cast<char *>(&ne_cur), sizeof(ne_cur));
+            ne[i] = ne_cur;
             nelements *= ne[i];
         }

@@ -336,14 +339,14 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
         }
 
         if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lld, %lld], expected [%lld, %lld]\n",
                     __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
             return false;
         }
 
         if (0) {
             static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-            printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
+            printf("%24s - [%5lld, %5lld], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
         }
 
         size_t bpe = 0;
@@ -432,6 +435,7 @@ bool gpt2_eval(
     struct ggml_init_params params = {
         .mem_size   = buf_size,
         .mem_buffer = buf,
+        .no_alloc   = false,
     };
 
     struct ggml_context * ctx0 = ggml_init(params);
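
The loader change above reads each dimension from disk as int32_t and widens it into an int64_t array before multiplying, so the file format stays unchanged while in-memory element counts can exceed 32-bit range. A standalone sketch of the same idea (the read_dims helper is hypothetical, not part of the commit):

#include <cstdint>
#include <fstream>

// hypothetical helper mirroring the loader change: dimensions stay 32-bit
// on disk, but are widened to 64-bit before the element count is computed
static int64_t read_dims(std::ifstream & fin, int n_dims, int64_t ne[2]) {
    int64_t nelements = 1;
    ne[0] = ne[1] = 1;
    for (int i = 0; i < n_dims; ++i) {
        int32_t ne_cur;
        fin.read(reinterpret_cast<char *>(&ne_cur), sizeof(ne_cur));
        ne[i] = ne_cur;     // widen before multiplying
        nelements *= ne[i]; // 64-bit product
    }
    return nelements;
}

One portability note on the new format strings: %lld matches int64_t where long long is the 64-bit type, but on LP64 platforms int64_t is long, so the strictly portable spelling would be PRId64 from <cinttypes>.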
2 changes: 1 addition & 1 deletion examples/gpt-2/quantize.cpp
@@ -291,7 +291,7 @@ int main(int argc, char ** argv) {
 
     // needed to initialize f16 tables
     {
-        struct ggml_init_params params = { 0, NULL };
+        struct ggml_init_params params = { 0, NULL, false };
        struct ggml_context * ctx = ggml_init(params);
        ggml_free(ctx);
    }
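
The quantize examples only pick up the new third field of ggml_init_params, but per the commit message the quantization routines themselves are now exposed from ggml. A hedged sketch of calling one of them; the signature (source floats, destination buffer, element count n, row width k, histogram output) is assumed from the ggml header of this era:

#include "ggml.h"

#include <cstdint>
#include <vector>

int main(void) {
    const int n = 64; // total elements; assumed to be a multiple of the block size
    std::vector<float>   src(n, 0.5f);
    std::vector<uint8_t> dst(n);      // generous upper bound for the q4_0 output
    int64_t hist[16] = {0};           // per-bucket counts filled by the call

    // assumed to return the number of bytes written into dst
    size_t n_bytes = ggml_quantize_q4_0(src.data(), dst.data(), n, n, hist);
    (void) n_bytes;

    return 0;
}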
16 changes: 10 additions & 6 deletions examples/gpt-j/main.cpp
@@ -198,6 +198,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
     struct ggml_init_params params = {
         .mem_size   = ctx_size,
         .mem_buffer = NULL,
+        .no_alloc   = false,
     };
 
     model.ctx = ggml_init(params);
@@ -310,10 +311,12 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
             break;
         }
 
-        int32_t nelements = 1;
-        int32_t ne[2] = { 1, 1 };
+        int64_t nelements = 1;
+        int64_t ne[2] = { 1, 1 };
         for (int i = 0; i < n_dims; ++i) {
-            fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+            int32_t ne_cur;
+            fin.read(reinterpret_cast<char *>(&ne_cur), sizeof(ne_cur));
+            ne[i] = ne_cur;
             nelements *= ne[i];
         }
 
@@ -332,14 +335,14 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
         }
 
         if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lld, %lld], expected [%lld, %lld]\n",
                     __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
             return false;
         }
 
         if (0) {
             static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-            printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
+            printf("%24s - [%5lld, %5lld], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
         }
 
         size_t bpe = 0;
@@ -357,7 +360,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
         };
 
         if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-            fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
+            fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %llu\n",
                     __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
             return false;
         }
@@ -431,6 +434,7 @@ bool gptj_eval(
     struct ggml_init_params params = {
         .mem_size   = buf_size,
         .mem_buffer = buf,
+        .no_alloc   = false,
     };
 
     struct ggml_context * ctx0 = ggml_init(params);
2 changes: 1 addition & 1 deletion examples/gpt-j/quantize.cpp
@@ -292,7 +292,7 @@ int main(int argc, char ** argv) {
 
     // needed to initialize f16 tables
    {
-        struct ggml_init_params params = { 0, NULL };
+        struct ggml_init_params params = { 0, NULL, false };
        struct ggml_context * ctx = ggml_init(params);
        ggml_free(ctx);
    }
61 changes: 40 additions & 21 deletions examples/whisper/main.cpp
@@ -8,6 +8,7 @@
 #include <string>
 #include <thread>
 #include <vector>
+#include <cstring>
 
 // Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
 // Lowest is red, middle is yellow, highest is green.
@@ -371,6 +372,39 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
     return true;
 }
 
+char *escape_double_quotes(const char *str) {
+    if (str == NULL) {
+        return NULL;
+    }
+
+    size_t escaped_length = strlen(str) + 1;
+
+    for (size_t i = 0; str[i] != '\0'; i++) {
+        if (str[i] == '"') {
+            escaped_length++;
+        }
+    }
+
+    char *escaped = (char *)calloc(escaped_length, 1); // pre-zeroed
+    if (escaped == NULL) {
+        return NULL;
+    }
+
+    size_t pos = 0;
+    for (size_t i = 0; str[i] != '\0'; i++) {
+        if (str[i] == '"') {
+            escaped[pos++] = '\\';
+            escaped[pos++] = '"';
+        } else {
+            escaped[pos++] = str[i];
+        }
+    }
+
+    // no need to set zero due to calloc() being used prior
+
+    return escaped;
+}
+
 bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
     std::ofstream fout(fname);
     int indent = 0;
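
A quick illustration of the helper added above (illustration only, not from the commit). The caller owns the returned buffer, which is why the value_s lambda in the next hunk frees it after writing:

#include <cstdio>
#include <cstdlib>

char *escape_double_quotes(const char *str); // the helper from the hunk above

int main(void) {
    char * s = escape_double_quotes("say \"hello\"");
    if (s != NULL) {
        printf("%s\n", s); // prints: say \"hello\"
        free(s);           // caller owns the escaped copy
    }
    return 0;
}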
@@ -414,7 +448,9 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
 
     auto value_s = [&](const char *name, const char *val, bool end = false) {
         start_value(name);
-        fout << "\"" << val << (end ? "\"\n" : "\",\n");
+        char * val_escaped = escape_double_quotes(val);
+        fout << "\"" << val_escaped << (end ? "\"\n" : "\",\n");
+        free(val_escaped);
     };
 
     auto end_value = [&](bool end = false) {
@@ -455,7 +491,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
                 value_i("ctx", whisper_model_n_text_ctx(ctx));
                 value_i("state", whisper_model_n_text_state(ctx));
                 value_i("head", whisper_model_n_text_head(ctx));
-                value_i("leyer", whisper_model_n_text_layer(ctx), true);
+                value_i("layer", whisper_model_n_text_layer(ctx), true);
             end_obj();
             value_i("mels", whisper_model_n_mels(ctx));
             value_i("f16", whisper_model_f16(ctx), true);
@@ -477,7 +513,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
         const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
 
         start_obj();
-            start_obj("timestanps");
+            start_obj("timestamps");
                 value_s("from", to_timestamp(t0, true).c_str());
                 value_s("to", to_timestamp(t1, true).c_str(), true);
             end_obj();
@@ -639,22 +675,6 @@ int main(int argc, char ** argv) {
         return 3;
     }
 
-    // initial prompt
-    std::vector<whisper_token> prompt_tokens;
-
-    if (!params.prompt.empty()) {
-        prompt_tokens.resize(1024);
-        prompt_tokens.resize(whisper_tokenize(ctx, params.prompt.c_str(), prompt_tokens.data(), prompt_tokens.size()));
-
-        fprintf(stderr, "\n");
-        fprintf(stderr, "initial prompt: '%s'\n", params.prompt.c_str());
-        fprintf(stderr, "initial tokens: [ ");
-        for (int i = 0; i < (int) prompt_tokens.size(); ++i) {
-            fprintf(stderr, "%d ", prompt_tokens[i]);
-        }
-        fprintf(stderr, "]\n");
-    }
-
     for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
         const auto fname_inp = params.fname_inp[f];
         const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
@@ -718,8 +738,7 @@
 
         wparams.speed_up = params.speed_up;
 
-        wparams.prompt_tokens   = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
-        wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
+        wparams.initial_prompt = params.prompt.c_str();
 
         wparams.greedy.best_of        = params.best_of;
         wparams.beam_search.beam_size = params.beam_size;
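
With this sync the manual tokenization block removed above is no longer needed: the raw prompt string is handed to whisper and tokenized inside the library. A minimal sketch of the simplified call site, using the field and function names shown in the diff (the prompt string must stay alive until whisper_full() runs):

#include "whisper.h"

// sketch: the prompt is now passed as plain text instead of pre-tokenized ids
static struct whisper_full_params make_params(const char * prompt) {
    struct whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    wparams.initial_prompt = prompt; // tokenization now happens inside the library

    // replaces the removed pair:
    //   wparams.prompt_tokens   = prompt_tokens.data();
    //   wparams.prompt_n_tokens = prompt_tokens.size();

    return wparams;
}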
2 changes: 1 addition & 1 deletion examples/whisper/quantize.cpp
@@ -334,7 +334,7 @@ int main(int argc, char ** argv) {
 
     // needed to initialize f16 tables
    {
-        struct ggml_init_params params = { 0, NULL };
+        struct ggml_init_params params = { 0, NULL, false };
        struct ggml_context * ctx = ggml_init(params);
        ggml_free(ctx);
    }