Commit 9acc42f

fixup! Works on EXT4/NTFS
1 parent 56f2ed1 · commit 9acc42f

2 files changed: 63 additions & 52 deletions

llama-util.h

Lines changed: 8 additions & 1 deletion
@@ -79,6 +79,7 @@ struct llama_file {
     size_t size;
 #ifdef GGML_USE_CUBLAS
     CUfileHandle_t cf_handle;
+    bool cf_need_workaround = false;
 #endif // GGML_USE_CUBLAS
 
     llama_file(const char * fname, const char * mode) {
@@ -98,7 +99,13 @@ struct llama_file {
         cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
 
         CUfileError_t status = cuFileHandleRegister(&cf_handle, &cf_descr);
-        CUFILE_CHECK(status);
+        if (status.err == CU_FILE_INTERNAL_ERROR) {
+            fprintf(stderr, "WARNING: cuFile experienced an internal error while loading weights from \"%s\". Using a workaround (slower). "
+                            "This happens with weight files on Btrfs partitions. ext4 and NTFS are confirmed to work.", fname);
+            cf_need_workaround = true;
+        } else {
+            CUFILE_CHECK(status);
+        }
 #endif // GGML_USE_CUBLAS
     }
 
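For reference, the change above registers the weight file with cuFile and, instead of aborting on CU_FILE_INTERNAL_ERROR (observed with weight files on Btrfs), records a flag so the loader can fall back to ordinary reads later. A minimal self-contained sketch of that fallback pattern, assuming libcufile and a POSIX file descriptor; the struct and helper names here are illustrative and not part of the commit:

#include <cstdio>
#include <fcntl.h>
#include <cufile.h>

struct gds_file {
    int            fd              = -1;
    CUfileHandle_t cf_handle       = nullptr;
    bool           need_workaround = false; // true -> stage reads through host memory
};

// Open `fname` and try to register it with cuFile; on an internal error
// (e.g. an unsupported filesystem) keep the file usable and set the flag.
static bool open_with_cufile_fallback(const char * fname, gds_file & out) {
    out.fd = open(fname, O_RDONLY); // production GDS code would typically also use O_DIRECT
    if (out.fd < 0) {
        return false;
    }

    CUfileDescr_t cf_descr = {};
    cf_descr.handle.fd = out.fd;
    cf_descr.type      = CU_FILE_HANDLE_TYPE_OPAQUE_FD;

    CUfileError_t status = cuFileHandleRegister(&out.cf_handle, &cf_descr);
    if (status.err == CU_FILE_SUCCESS) {
        return true; // GPUDirect Storage path is available
    }
    if (status.err == CU_FILE_INTERNAL_ERROR) {
        // seen with weight files on Btrfs: keep going, but use host-staged reads
        fprintf(stderr, "cuFile registration failed, falling back to host-staged reads\n");
        out.need_workaround = true;
        return true;
    }
    return false; // any other error is treated as fatal, like CUFILE_CHECK in the commit
}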
llama.cpp

Lines changed: 55 additions & 51 deletions
@@ -10,8 +10,9 @@
 
 #include "ggml.h"
 #ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
+#include <cuda_runtime.h>
 #include <cufile.h>
+#include "ggml-cuda.h"
 #endif
 
 #include <array>
@@ -784,7 +785,18 @@ struct llama_model_loader {
             size_t offset = lt.shards.at(0).file_off;
             size_t actual_size;
             void * buf = ggml_cuda_pool_malloc(lt.size, &actual_size);
-            cuFileRead(file.cf_handle, buf, lt.size, offset, 0);
+
+            if (file.cf_need_workaround) { // load to host, then copy to device
+                void * buf_host = ggml_cuda_host_malloc(lt.size);
+                file.seek(offset, SEEK_SET);
+                file.read_raw(buf_host, lt.size);
+                cudaMemcpy(buf, buf_host, lt.size, cudaMemcpyHostToDevice);
+                cudaDeviceSynchronize();
+                ggml_cuda_host_free(buf_host);
+            } else { // load directly to device
+                cuFileRead(file.cf_handle, buf, lt.size, offset, 0);
+            }
+
             lt.data = (uint8_t *) buf;
         }
 #endif // GGML_USE_CUBLAS
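The hunk above picks between two load paths: a direct cuFileRead into the device buffer, or, when cf_need_workaround is set, a read into host memory followed by a cudaMemcpy. A rough standalone sketch of the same two paths, using plain POSIX and CUDA runtime calls instead of the llama_file and ggml helpers; the function and its signature are made up for illustration:

#include <unistd.h>
#include <cuda_runtime.h>
#include <cufile.h>

// Fill the device buffer `dev_buf` with `size` bytes taken at `offset` in the file.
static bool load_tensor_data(int fd, CUfileHandle_t cf_handle, bool need_workaround,
                             void * dev_buf, size_t size, off_t offset) {
    if (!need_workaround) {
        // fast path: GPUDirect Storage reads straight into device memory
        return cuFileRead(cf_handle, dev_buf, size, offset, 0) == (ssize_t) size;
    }
    // slow path: stage through pinned host memory, then copy to the device
    void * host_buf = nullptr;
    if (cudaMallocHost(&host_buf, size) != cudaSuccess) {
        return false;
    }
    const bool ok = pread(fd, host_buf, size, offset) == (ssize_t) size
                 && cudaMemcpy(dev_buf, host_buf, size, cudaMemcpyHostToDevice) == cudaSuccess
                 && cudaDeviceSynchronize() == cudaSuccess;
    cudaFreeHost(host_buf);
    return ok;
}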
@@ -974,26 +986,6 @@ static void llama_model_load_internal(
     ml->calc_sizes(&ctx_size, &mmapped_size);
     fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
 
-    // print memory requirements
-    {
-        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
-        // this is the total memory required to run the inference
-        const size_t mem_required =
-            ctx_size +
-            mmapped_size +
-            MEM_REQ_SCRATCH0().at(model.type) +
-            MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
-
-        // this is the memory required by one llama_state
-        const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
-
-        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-    }
-
     // create the ggml context
     {
         lctx.model.buf.resize(ctx_size);
@@ -1015,6 +1007,7 @@ static void llama_model_load_internal(
     }
 
     // prepare memory for the weights
+    size_t vram_total = 0;
     {
         const uint32_t n_embd = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
@@ -1024,7 +1017,13 @@ static void llama_model_load_internal(
 
         model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
         model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
-        model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+        ggml_backend backend_output;
+        if (n_gpu_layers > int(n_layer)) {
+            backend_output = GGML_BACKEND_CUDA;
+        } else {
+            backend_output = GGML_BACKEND_CPU;
+        }
+        model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
 
         model.layers.resize(n_layer);
         const int i_gpu_start = n_layer - n_gpu_layers;
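This hunk places output.weight on the GPU only when n_gpu_layers exceeds the total layer count, i.e. once every repeating layer is already offloaded; the per-layer decision uses i_gpu_start = n_layer - n_gpu_layers from the context lines. A small helper capturing that placement rule as implied by the surrounding code (the enum and function are stand-ins, not part of the commit):

// Stand-ins for ggml_backend; illustrative only.
enum backend_kind { BACKEND_CPU, BACKEND_CUDA };

// Layers with index >= n_layer - n_gpu_layers go to the GPU; output.weight
// moves to the GPU only once every repeating layer is already offloaded.
static backend_kind pick_backend(int layer_idx /* -1 for output.weight */,
                                 int n_layer, int n_gpu_layers) {
    const int i_gpu_start = n_layer - n_gpu_layers;
    if (layer_idx < 0) {
        return n_gpu_layers > n_layer ? BACKEND_CUDA : BACKEND_CPU;
    }
    return layer_idx >= i_gpu_start ? BACKEND_CUDA : BACKEND_CPU;
}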
@@ -1046,51 +1045,56 @@ static void llama_model_load_internal(
             layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
             layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
             layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
+            if (backend == GGML_BACKEND_CUDA) {
+                vram_total += ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk)
+                            + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm)
+                            + ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+            }
         }
     }
 
     ml->done_getting_tensors();
 
-    // populate `tensors_by_name`
-    for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-        model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
-    }
-
-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
-
-    model.mapping = std::move(ml->mapping);
-#ifdef GGML_USE_CUBLAS
+    // print memory requirements
     {
-        // const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-        const int n_gpu = 0;
+        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
 
-        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+        // this is the total memory required to run the inference
+        const size_t mem_required =
+            ctx_size +
+            mmapped_size - vram_total + // weights in VRAM not in memory
+            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH1().at(model.type) +
+            MEM_REQ_EVAL().at(model.type);
 
-        size_t vram_total = 0;
+        // this is the memory required by one llama_state
+        const size_t mem_required_state =
+            scale*MEM_REQ_KV_SELF().at(model.type);
 
-        for (int i = 0; i < n_gpu; ++i) {
-            const auto & layer = model.layers[i];
+        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
-            ggml_cuda_transform_tensor(layer.attention_norm); vram_total += ggml_nbytes(layer.attention_norm);
-            ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
-            ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
-            ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
-            ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
-            ggml_cuda_transform_tensor(layer.ffn_norm); vram_total += ggml_nbytes(layer.ffn_norm);
-            ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
-            ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
-            ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
-        }
+#ifdef GGML_USE_CUBLAS
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
         if (n_gpu_layers > (int) hparams.n_layer) {
             fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
-            ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
         }
-
         fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
-    }
 #else
     (void) n_gpu_layers;
 #endif
+    }
+
+    // populate `tensors_by_name`
+    for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+        model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
+    }
+
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+
+    model.mapping = std::move(ml->mapping);
 
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
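With this hunk the "print memory requirements" block moves below the tensor-placement loop so that vram_total is known when it is printed, the host-memory estimate subtracts the bytes now resident in VRAM, and n_gpu is computed with std::min instead of being hard-coded to 0 (the explicit ggml_cuda_transform_tensor calls are gone, since placement now happens through the backend passed to get_tensor). A minimal sketch of the revised host-memory estimate, with made-up parameter names standing in for the MEM_REQ_* values:

#include <cstddef>

// Host memory needed for inference after offloading: weights resident in VRAM
// (vram_total) no longer count against the mmapped weight size.
static size_t host_mem_required(size_t ctx_size, size_t mmapped_size, size_t vram_total,
                                size_t scratch0, size_t scratch1, size_t eval) {
    return ctx_size + (mmapped_size - vram_total) + scratch0 + scratch1 + eval;
}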
