@@ -274,16 +274,18 @@ static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){
274
274
GGML_METAL_ADD_KERNEL (mul_mv_q4_K_f32);
275
275
GGML_METAL_ADD_KERNEL (mul_mv_q5_K_f32);
276
276
GGML_METAL_ADD_KERNEL (mul_mv_q6_K_f32);
277
- GGML_METAL_ADD_KERNEL (mul_mm_f32_f32);
278
- GGML_METAL_ADD_KERNEL (mul_mm_f16_f32);
279
- GGML_METAL_ADD_KERNEL (mul_mm_q4_0_f32);
280
- GGML_METAL_ADD_KERNEL (mul_mm_q8_0_f32);
281
- GGML_METAL_ADD_KERNEL (mul_mm_q4_1_f32);
282
- GGML_METAL_ADD_KERNEL (mul_mm_q2_K_f32);
283
- GGML_METAL_ADD_KERNEL (mul_mm_q3_K_f32);
284
- GGML_METAL_ADD_KERNEL (mul_mm_q4_K_f32);
285
- GGML_METAL_ADD_KERNEL (mul_mm_q5_K_f32);
286
- GGML_METAL_ADD_KERNEL (mul_mm_q6_K_f32);
277
+ if ([ctx->device supportsFamily: MTLGPUFamilyApple7]) {
278
+ GGML_METAL_ADD_KERNEL (mul_mm_f32_f32);
279
+ GGML_METAL_ADD_KERNEL (mul_mm_f16_f32);
280
+ GGML_METAL_ADD_KERNEL (mul_mm_q4_0_f32);
281
+ GGML_METAL_ADD_KERNEL (mul_mm_q8_0_f32);
282
+ GGML_METAL_ADD_KERNEL (mul_mm_q4_1_f32);
283
+ GGML_METAL_ADD_KERNEL (mul_mm_q2_K_f32);
284
+ GGML_METAL_ADD_KERNEL (mul_mm_q3_K_f32);
285
+ GGML_METAL_ADD_KERNEL (mul_mm_q4_K_f32);
286
+ GGML_METAL_ADD_KERNEL (mul_mm_q5_K_f32);
287
+ GGML_METAL_ADD_KERNEL (mul_mm_q6_K_f32);
288
+ }
287
289
GGML_METAL_ADD_KERNEL (rope_f32);
288
290
GGML_METAL_ADD_KERNEL (rope_f16);
289
291
GGML_METAL_ADD_KERNEL (alibi_f32);
@@ -296,8 +298,22 @@ static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){
296
298
#undef GGML_METAL_ADD_KERNEL
297
299
}
298
300
299
- GGML_METAL_LOG_INFO (" %s : hasUnifiedMemory = %s \n " , __func__, ctx->device .hasUnifiedMemory ? " true" : " false" );
300
301
#if TARGET_OS_OSX
302
+ // print MTL GPU family:
303
+ GGML_METAL_LOG_INFO (" %s : GPU name: %s \n " , __func__, [[ctx->device name ] UTF8String ]);
304
+ GGML_METAL_LOG_INFO (" %s : GPU arch: %s \n " , __func__, [[ctx->device architecture ].name UTF8String ]);
305
+
306
+ // determine max supported GPU family
307
+ // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
308
+ // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
309
+ for (int i = MTLGPUFamilyApple9 + 10 ; i >= MTLGPUFamilyApple1 ; --i) {
310
+ if ([ctx->device supportsFamily: i]) {
311
+ GGML_METAL_LOG_INFO (" %s : GPU family: MTLGPUFamilyApple%d (%d )\n " , __func__, i - MTLGPUFamilyApple1 + 1 , i);
312
+ break ;
313
+ }
314
+ }
315
+
316
+ GGML_METAL_LOG_INFO (" %s : hasUnifiedMemory = %s \n " , __func__, ctx->device .hasUnifiedMemory ? " true" : " false" );
301
317
GGML_METAL_LOG_INFO (" %s : recommendedMaxWorkingSetSize = %8.2f MB\n " , __func__, ctx->device .recommendedMaxWorkingSetSize / 1024.0 / 1024.0 );
302
318
if (ctx->device .maxTransferRate != 0 ) {
303
319
GGML_METAL_LOG_INFO (" %s : maxTransferRate = %8.2f MB/s\n " , __func__, ctx->device .maxTransferRate / 1024.0 / 1024.0 );
@@ -351,16 +367,18 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
351
367
GGML_METAL_DEL_KERNEL (mul_mv_q4_K_f32);
352
368
GGML_METAL_DEL_KERNEL (mul_mv_q5_K_f32);
353
369
GGML_METAL_DEL_KERNEL (mul_mv_q6_K_f32);
354
- GGML_METAL_DEL_KERNEL (mul_mm_f32_f32);
355
- GGML_METAL_DEL_KERNEL (mul_mm_f16_f32);
356
- GGML_METAL_DEL_KERNEL (mul_mm_q4_0_f32);
357
- GGML_METAL_DEL_KERNEL (mul_mm_q8_0_f32);
358
- GGML_METAL_DEL_KERNEL (mul_mm_q4_1_f32);
359
- GGML_METAL_DEL_KERNEL (mul_mm_q2_K_f32);
360
- GGML_METAL_DEL_KERNEL (mul_mm_q3_K_f32);
361
- GGML_METAL_DEL_KERNEL (mul_mm_q4_K_f32);
362
- GGML_METAL_DEL_KERNEL (mul_mm_q5_K_f32);
363
- GGML_METAL_DEL_KERNEL (mul_mm_q6_K_f32);
370
+ if ([ctx->device supportsFamily: MTLGPUFamilyApple7]) {
371
+ GGML_METAL_DEL_KERNEL (mul_mm_f32_f32);
372
+ GGML_METAL_DEL_KERNEL (mul_mm_f16_f32);
373
+ GGML_METAL_DEL_KERNEL (mul_mm_q4_0_f32);
374
+ GGML_METAL_DEL_KERNEL (mul_mm_q8_0_f32);
375
+ GGML_METAL_DEL_KERNEL (mul_mm_q4_1_f32);
376
+ GGML_METAL_DEL_KERNEL (mul_mm_q2_K_f32);
377
+ GGML_METAL_DEL_KERNEL (mul_mm_q3_K_f32);
378
+ GGML_METAL_DEL_KERNEL (mul_mm_q4_K_f32);
379
+ GGML_METAL_DEL_KERNEL (mul_mm_q5_K_f32);
380
+ GGML_METAL_DEL_KERNEL (mul_mm_q6_K_f32);
381
+ }
364
382
GGML_METAL_DEL_KERNEL (rope_f32);
365
383
GGML_METAL_DEL_KERNEL (rope_f16);
366
384
GGML_METAL_DEL_KERNEL (alibi_f32);
@@ -986,32 +1004,36 @@ void ggml_metal_graph_compute(
986
1004
} break ;
987
1005
case GGML_OP_MUL_MAT:
988
1006
{
989
- // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
990
-
991
1007
GGML_ASSERT (ne00 == ne10);
992
- // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
993
- uint gqa = ne12/ne02;
994
1008
GGML_ASSERT (ne03 == ne13);
995
1009
1010
+ const uint gqa = ne12/ne02;
1011
+
996
1012
// find the break-even point where the matrix-matrix kernel becomes more efficient compared
997
- // to the matrix-vector kernel. the numbers below are measured on M2 Ultra
998
- // not sure if this translates across all chips
1013
+ // to the matrix-vector kernel
999
1014
int ne11_mm_min = 1 ;
1000
1015
1001
- switch (src0t) {
1002
- case GGML_TYPE_F16: ne11_mm_min = 2 ; break ;
1003
- case GGML_TYPE_Q8_0: ne11_mm_min = 7 ; break ;
1004
- case GGML_TYPE_Q2_K: ne11_mm_min = 15 ; break ;
1005
- case GGML_TYPE_Q3_K: ne11_mm_min = 7 ; break ;
1006
- case GGML_TYPE_Q4_0:
1007
- case GGML_TYPE_Q4_1: ne11_mm_min = 15 ; break ;
1008
- case GGML_TYPE_Q4_K: ne11_mm_min = 11 ; break ;
1009
- case GGML_TYPE_Q5_0: // not tested yet
1010
- case GGML_TYPE_Q5_1: ne11_mm_min = 13 ; break ; // not tested yet
1011
- case GGML_TYPE_Q5_K: ne11_mm_min = 7 ; break ;
1012
- case GGML_TYPE_Q6_K: ne11_mm_min = 7 ; break ;
1013
- default : ne11_mm_min = 1 ; break ;
1016
+ #if 0
1017
+ // the numbers below are measured on M2 Ultra for 7B and 13B models
1018
+ // these numbers do not translate to other devices or model sizes
1019
+ // TODO: need to find a better approach
1020
+ if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
1021
+ switch (src0t) {
1022
+ case GGML_TYPE_F16: ne11_mm_min = 2; break;
1023
+ case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
1024
+ case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
1025
+ case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
1026
+ case GGML_TYPE_Q4_0:
1027
+ case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
1028
+ case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
1029
+ case GGML_TYPE_Q5_0: // not tested yet
1030
+ case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
1031
+ case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
1032
+ case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
1033
+ default: ne11_mm_min = 1; break;
1034
+ }
1014
1035
}
1036
+ #endif
1015
1037
1016
1038
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
1017
1039
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
0 commit comments