 
 #include "ggml.h"
 #ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
+#include <cuda_runtime.h>
 #include <cufile.h>
+#include "ggml-cuda.h"
 #endif
 
 #include <array>
@@ -784,7 +785,18 @@ struct llama_model_loader {
         size_t offset = lt.shards.at(0).file_off;
         size_t actual_size;
         void * buf = ggml_cuda_pool_malloc(lt.size, &actual_size);
-        cuFileRead(file.cf_handle, buf, lt.size, offset, 0);
+
+        if (file.cf_need_workaround) { // load to host, then copy to device
+            void * buf_host = ggml_cuda_host_malloc(lt.size);
+            file.seek(offset, SEEK_SET);
+            file.read_raw(buf_host, lt.size);
+            cudaMemcpy(buf, buf_host, lt.size, cudaMemcpyHostToDevice);
+            cudaDeviceSynchronize();
+            ggml_cuda_host_free(buf_host);
+        } else { // load directly to device
+            cuFileRead(file.cf_handle, buf, lt.size, offset, 0);
+        }
+
         lt.data = (uint8_t *) buf;
     }
 #endif // GGML_USE_CUBLAS
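Note on the hunk above: the staging path only runs when `cf_need_workaround` is set, i.e. when the loader has decided that a direct cuFile read into the device buffer is not usable; otherwise `cuFileRead` is kept. Below is a minimal, self-contained sketch of the same host-staging pattern using only the CUDA runtime and stdio. The function name, the plain `FILE` handling, and the error handling are illustrative assumptions, not part of the patch, which goes through the loader's own `ggml_cuda_*` and `llama_file` helpers.

    // Sketch of the staging fallback: read `size` bytes at `offset` from `path`
    // into a freshly allocated device buffer, staging through pinned host memory.
    #include <cstddef>
    #include <cstdio>
    #include <cuda_runtime.h>

    static void * load_to_device_via_host(const char * path, size_t offset, size_t size) {
        void * dev = nullptr;
        if (cudaMalloc(&dev, size) != cudaSuccess) {
            return nullptr;
        }

        void * host = nullptr;
        cudaMallocHost(&host, size);   // pinned host buffer -> fast host-to-device copy

        FILE * f = std::fopen(path, "rb");
        if (!f) {
            cudaFreeHost(host);
            cudaFree(dev);
            return nullptr;
        }
        std::fseek(f, (long) offset, SEEK_SET);   // sketch only; real code should use a 64-bit seek
        const size_t n = std::fread(host, 1, size, f);
        std::fclose(f);

        cudaMemcpy(dev, host, n, cudaMemcpyHostToDevice);
        cudaDeviceSynchronize();   // make sure the copy is finished before freeing the host buffer
        cudaFreeHost(host);
        return dev;
    }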
@@ -974,26 +986,6 @@ static void llama_model_load_internal(
     ml->calc_sizes(&ctx_size, &mmapped_size);
     fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
 
-    // print memory requirements
-    {
-        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
-        // this is the total memory required to run the inference
-        const size_t mem_required =
-            ctx_size +
-            mmapped_size +
-            MEM_REQ_SCRATCH0().at(model.type) +
-            MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
-
-        // this is the memory required by one llama_state
-        const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
-
-        fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-    }
-
     // create the ggml context
     {
         lctx.model.buf.resize(ctx_size);
@@ -1015,6 +1007,7 @@ static void llama_model_load_internal(
     }
 
     // prepare memory for the weights
+    size_t vram_total = 0;
     {
         const uint32_t n_embd  = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
@@ -1024,7 +1017,13 @@ static void llama_model_load_internal(
 
         model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
         model.norm           = ml->get_tensor("norm.weight",           {n_embd},          GGML_BACKEND_CPU);
-        model.output         = ml->get_tensor("output.weight",         {n_embd, n_vocab}, GGML_BACKEND_CPU);
+        ggml_backend backend_output;
+        if (n_gpu_layers > int(n_layer)) {
+            backend_output = GGML_BACKEND_CUDA;
+        } else {
+            backend_output = GGML_BACKEND_CPU;
+        }
+        model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
 
         model.layers.resize(n_layer);
         const int i_gpu_start = n_layer - n_gpu_layers;
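The placement convention implied here and in the next hunk: the last `n_gpu_layers` repeating layers (indices at or above `i_gpu_start`) get the CUDA backend, and `output.weight` is offloaded only once every repeating layer already is. A standalone restatement of that selection rule follows; the enum and helper names are hypothetical, only the threshold logic is taken from the diff.

    // Hypothetical helpers restating the offload thresholds used above.
    enum backend_kind { BACKEND_CPU, BACKEND_GPU };

    // Repeating layers: offload the last n_gpu_layers of n_layer total.
    static backend_kind layer_backend(int i, int n_layer, int n_gpu_layers) {
        const int i_gpu_start = n_layer - n_gpu_layers;   // negative means "offload everything"
        return i >= i_gpu_start ? BACKEND_GPU : BACKEND_CPU;
    }

    // Output matrix: offload it only when n_gpu_layers exceeds the layer count.
    static backend_kind output_backend(int n_layer, int n_gpu_layers) {
        return n_gpu_layers > n_layer ? BACKEND_GPU : BACKEND_CPU;
    }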
@@ -1046,51 +1045,56 @@ static void llama_model_load_internal(
             layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff}, backend);
             layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {  n_ff, n_embd}, backend);
             layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff}, backend);
+            if (backend == GGML_BACKEND_CUDA) {
+                vram_total += ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk)
+                            + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm)
+                            + ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+            }
         }
     }
 
     ml->done_getting_tensors();
 
-    // populate `tensors_by_name`
-    for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-        model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
-    }
-
-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
-
-    model.mapping = std::move(ml->mapping);
-#ifdef GGML_USE_CUBLAS
+    // print memory requirements
     {
-        // const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-        const int n_gpu = 0;
+        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
 
-        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+        // this is the total memory required to run the inference
+        const size_t mem_required =
+            ctx_size +
+            mmapped_size - vram_total + // weights in VRAM not in memory
+            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH1().at(model.type) +
+            MEM_REQ_EVAL().at(model.type);
 
-        size_t vram_total = 0;
+        // this is the memory required by one llama_state
+        const size_t mem_required_state =
+            scale*MEM_REQ_KV_SELF().at(model.type);
 
-        for (int i = 0; i < n_gpu; ++i) {
-            const auto & layer = model.layers[i];
+        fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
-            ggml_cuda_transform_tensor(layer.attention_norm); vram_total += ggml_nbytes(layer.attention_norm);
-            ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
-            ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
-            ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
-            ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
-            ggml_cuda_transform_tensor(layer.ffn_norm); vram_total += ggml_nbytes(layer.ffn_norm);
-            ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
-            ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
-            ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
-        }
+#ifdef GGML_USE_CUBLAS
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
         if (n_gpu_layers > (int) hparams.n_layer) {
             fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
-            ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
         }
-
         fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
-    }
 #else
         (void) n_gpu_layers;
 #endif
+    }
+
+    // populate `tensors_by_name`
+    for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+        model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
+    }
+
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+
+    model.mapping = std::move(ml->mapping);
 
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
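The reason the "print memory requirements" block moves below the tensor placement is visible in the new formula: the bytes of every layer assigned to the CUDA backend are summed into `vram_total` and subtracted from `mmapped_size`, so weights living in VRAM no longer count against the host-side estimate. A compact restatement of that accounting is sketched below, with the `MEM_REQ_*` lookups passed in as plain parameters, since those per-model-type tables are not part of this diff.

    // Sketch of the host-memory estimate after this change. All sizes are in bytes.
    #include <cstddef>

    struct mem_estimate {
        size_t required;         // host memory needed to run inference
        size_t required_state;   // additional memory per llama_state (KV cache)
    };

    static mem_estimate estimate_host_mem(size_t ctx_size, size_t mmapped_size, size_t vram_total,
                                          size_t scratch0, size_t scratch1, size_t eval,
                                          size_t kv_self, bool kv_f32) {
        mem_estimate e;
        // weights that were placed in VRAM no longer count against host memory
        e.required = ctx_size + (mmapped_size - vram_total) + scratch0 + scratch1 + eval;
        // an f32 KV cache takes twice the space of the f16 default
        e.required_state = (kv_f32 ? 2 : 1) * kv_self;
        return e;
    }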