Bump tqdm from 4.66.2 to 4.66.3 #21

Closed
wants to merge 35 commits into from

Changes from 1 commit · 35 commits

506d2fe  llama : expose llama_load_model_from_file_gpt4all (cebtenzzre, Nov 24, 2023)
88289b5  kompute : fix ggml_vk_device leaks (cebtenzzre, Jan 31, 2024)
3f7c4b9  kompute : fix c++11 compatibility (cebtenzzre, Jan 31, 2024)
9d5207b  kompute : enable Pascal GPUs (cebtenzzre, Jan 31, 2024)
b6891bc  llama : wrap llama_new_context_with_model in try/catch (cebtenzzre, Feb 1, 2024)
b80287e  kompute : add missing call to ggml_backend_kompute_device_unref (cebtenzzre, Feb 1, 2024)
dc7a50b  kompute : fix ggml_vk_allocate failure control flow (cebtenzzre, Feb 1, 2024)
c5014f6  kompute : disable GPU offload for Mixtral (cebtenzzre, Feb 5, 2024)
c76f5c3  kompute : do not list Intel GPUs as they are unsupported (#14) (cebtenzzre, Feb 12, 2024)
6ff4387  kompute : make partial tensor copies faster by syncing less data (#15) (cebtenzzre, Feb 13, 2024)
12dcddc  kompute : disable LLAMA_SPLIT_LAYER after ggerganov/llama.cpp#5321 (cebtenzzre, Feb 21, 2024)
82b50e5  kompute : add gemma, phi-2, qwen2, and stablelm to whitelist (cebtenzzre, Feb 21, 2024)
a76f5f4  kompute : enable GPU support for 10 more model architectures (cebtenzzre, Feb 22, 2024)
877851b  llama : fix -Wunused-const-variable warning for non-Kompute build (cebtenzzre, Feb 22, 2024)
729d661  llama : expose model name and architecture via API (cebtenzzre, Mar 5, 2024)
2b8cb26  kompute : put device with most VRAM first, not least (cebtenzzre, May 1, 2024)
6e0b5d9  vulkan : make ggml_vk_instance_init static (cebtenzzre, Apr 30, 2024)
aea0abe  vulkan : don't filter devices by default, don't abort if none (cebtenzzre, Apr 30, 2024)
535c7b1  vulkan : implement ggml_vk_available_devices (cebtenzzre, Apr 30, 2024)
2a91dbf  vulkan : guard against multiple initialization (cebtenzzre, May 1, 2024)
ad1ab57  rocm : symlink source files so CUDA can be built in the same project (cebtenzzre, May 2, 2024)
09058b1  cuda : implement ggml_cuda_available_devices (cebtenzzre, May 6, 2024)
b0ccbe1  kompute : update submodule for install fix (cebtenzzre, May 8, 2024)
74a41c6  kompute : fix leaks in ggml_vk_current_device (cebtenzzre, May 13, 2024)
f10326c  kompute : fix use-after-free in ggml_vk_get_device (cebtenzzre, May 20, 2024)
e5c0df7  llama : replace ngl=0 hack with llama_model_using_gpu (cebtenzzre, Jun 4, 2024)
159235e  llama : use the correct buffer type when we choose not to load on GPU (cebtenzzre, Jul 10, 2024)
c301b42  kompute : update for leak fixes, cleanup changes, shaderFloat16 (cebtenzzre, Jul 18, 2024)
7d402b3  kompute : plug a few memory leaks (cebtenzzre, Jul 18, 2024)
48a830c  common : Kompute supports --main-gpu, do not warn (cebtenzzre, Jul 18, 2024)
6e0ad3c  kompute : fix dangling references in ggml_vk_graph_kompute (cebtenzzre, Jul 18, 2024)
c3d5264  kompute : avoid freeing device/instance until absolutely necessary (cebtenzzre, Jul 18, 2024)
561d0ce  kompute : update ggml_vk_supports_op to fix false pos/neg (cebtenzzre, Jul 18, 2024)
cd13f44  kompute : fix missing unref on allocation failure (cebtenzzre, Jul 18, 2024)
3d4c558  Bump tqdm from 4.66.2 to 4.66.3 (dependabot[bot], Jul 18, 2024)

cuda : implement ggml_cuda_available_devices
cebtenzzre committed Jul 18, 2024
commit 09058b1847d5a024cc7c56809cea3fd66b4d3dd6
13 changes: 12 additions & 1 deletion ggml/include/ggml-cuda.h
@@ -3,6 +3,8 @@
#include "ggml.h"
#include "ggml-backend.h"

#include <stddef.h>

#ifdef GGML_USE_HIPBLAS
#define GGML_CUDA_NAME "ROCm"
#define GGML_CUBLAS_NAME "hipBLAS"
@@ -11,11 +13,20 @@
#define GGML_CUBLAS_NAME "cuBLAS"
#endif

#define GGML_CUDA_MAX_DEVICES 16

#ifdef __cplusplus
extern "C" {
#endif

#define GGML_CUDA_MAX_DEVICES 16
struct ggml_cuda_device {
    uint32_t index;
    uint64_t heapSize;
    const char * name;
};

GGML_API GGML_CALL struct ggml_cuda_device * ggml_cuda_available_devices(size_t * count);
GGML_API GGML_CALL void ggml_cuda_device_destroy(ggml_cuda_device * device);

// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
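
For reference, here is a minimal caller-side sketch of the new header API. It is not part of this PR's diff, and the cleanup convention is an assumption inferred from the implementation below (each element's strdup'd name is released with ggml_cuda_device_destroy, and the malloc'd array itself is then freed by the caller):

// Hypothetical usage sketch, not from this PR.
#include <cstdio>
#include <cstdlib>

#include "ggml-cuda.h"

int main() {
    size_t count = 0;
    ggml_cuda_device * devices = ggml_cuda_available_devices(&count);

    for (size_t i = 0; i < count; i++) {
        // heapSize is the device's total VRAM in bytes; the array comes back
        // sorted by heapSize in descending order.
        printf("#%u: %s, %llu MiB VRAM\n",
               (unsigned) devices[i].index, devices[i].name,
               (unsigned long long) (devices[i].heapSize / (1024 * 1024)));
    }

    for (size_t i = 0; i < count; i++) {
        ggml_cuda_device_destroy(&devices[i]); // frees the strdup'd name
    }
    free(devices); // assumption: the array itself is owned by the caller
    return 0;
}
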
70 changes: 65 additions & 5 deletions ggml/src/ggml-cuda.cu
@@ -34,19 +34,20 @@
#include <algorithm>
#include <array>
#include <atomic>
#include <cfloat>
#include <cinttypes>
#include <cstdarg>
#include <cstddef>
#include <cstdint>
#include <float.h>
#include <cstdio>
#include <cstdlib>
#include <limits>
#include <list>
#include <map>
#include <memory>
#include <mutex>
#include <stdint.h>
#include <stdio.h>
#include <stdarg.h>
#include <stdlib.h>
#include <string>
#include <unordered_map>
#include <vector>

static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
@@ -186,6 +187,11 @@ static ggml_cuda_device_info ggml_cuda_init() {
        CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
        GGML_CUDA_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");

        info.devices[id].total_vram = prop.totalGlobalMem;
        auto &name_dst = info.devices[id].name;
        strncpy(name_dst, prop.name, sizeof name_dst);
        name_dst[sizeof name_dst - 1] = 0;

        info.default_tensor_split[id] = total_vram;
        total_vram += prop.totalGlobalMem;

@@ -3068,3 +3074,57 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
    }
    return device_count;
}

static std::list<ggml_cuda_device> ggml_cuda_available_devices_internal() {
    std::list<ggml_cuda_device> results;

    const auto & cuda_info = ggml_cuda_info();

    std::unordered_map<std::string, size_t> count_by_name;

    for (int dev_idx = 0; dev_idx < cuda_info.device_count; dev_idx++) {
        const auto & device = cuda_info.devices[dev_idx];

        std::string name(device.name);
        size_t n_idx = ++count_by_name[name];
        if (n_idx > 1) {
            name += " (" + std::to_string(n_idx) + ")";
        }

        results.push_back({
            /* index = */ uint32_t(dev_idx),
            /* heapSize = */ uint64_t(device.total_vram),
            /* name = */ strdup(name.c_str()),
        });
    }

    // std::list::sort is guaranteed to be stable
    results.sort(
        [](const ggml_cuda_device & a, const ggml_cuda_device & b) -> bool {
            return a.heapSize > b.heapSize; // descending
        }
    );

    return results;
}

// public API returns a C-style array
ggml_cuda_device * ggml_cuda_available_devices(size_t * count) {
    auto devices = ggml_cuda_available_devices_internal();
    *count = devices.size();
    if (devices.empty()) {
        return nullptr;
    }

    size_t nbytes = sizeof(ggml_cuda_device) * devices.size();
    auto * arr = static_cast<ggml_cuda_device *>(malloc(nbytes));

    int i = 0;
    for (auto & d : devices) { arr[i++] = d; }

    return arr;
}

void ggml_cuda_device_destroy(ggml_cuda_device * device) {
    free(const_cast<char *>(device->name));
}
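
For readers without the ggml context, a standalone sketch of the same enumeration done directly against the CUDA runtime API. It is not part of this PR; it simply mirrors the two fields the ggml_cuda_init() hunk above records (prop.name into devices[id].name, prop.totalGlobalMem into devices[id].total_vram):

// Hypothetical standalone sketch, not from this PR; requires only the CUDA runtime.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess) {
        return 1;
    }
    for (int id = 0; id < count; id++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, id) != cudaSuccess) {
            continue;
        }
        // These are the fields the patch copies into ggml_cuda_device_info.
        printf("Device %d: %s, %zu bytes VRAM\n", id, prop.name, prop.totalGlobalMem);
    }
    return 0;
}
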
1 change: 1 addition & 0 deletions ggml/src/ggml-cuda/common.cuh
@@ -669,6 +669,7 @@ struct ggml_cuda_device_info {
bool vmm; // virtual memory support
size_t vmm_granularity; // granularity of virtual memory
size_t total_vram;
char name[256];
};

cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};