Skip to content

Commit 6b9554a

Browse files
committed
metal : print more GPU info + disable mul_mm for MTLGPUFamily < Apple7
1 parent 545b034 commit 6b9554a

File tree

2 files changed

+65
-42
lines changed

2 files changed

+65
-42
lines changed

ggml-metal.m

Lines changed: 62 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -274,16 +274,18 @@ static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){
274274
GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32);
275275
GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32);
276276
GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32);
277-
GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
278-
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
279-
GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
280-
GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
281-
GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
282-
GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
283-
GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
284-
GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
285-
GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
286-
GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
277+
if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
278+
GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
279+
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
280+
GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
281+
GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
282+
GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
283+
GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
284+
GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
285+
GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
286+
GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
287+
GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
288+
}
287289
GGML_METAL_ADD_KERNEL(rope_f32);
288290
GGML_METAL_ADD_KERNEL(rope_f16);
289291
GGML_METAL_ADD_KERNEL(alibi_f32);
@@ -296,8 +298,22 @@ static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){
296298
#undef GGML_METAL_ADD_KERNEL
297299
}
298300

299-
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
300301
#if TARGET_OS_OSX
302+
// print MTL GPU family:
303+
GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]);
304+
GGML_METAL_LOG_INFO("%s: GPU arch: %s\n", __func__, [[ctx->device architecture].name UTF8String]);
305+
306+
// determine max supported GPU family
307+
// https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
308+
// https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
309+
for (int i = MTLGPUFamilyApple9 + 10; i >= MTLGPUFamilyApple1; --i) {
310+
if ([ctx->device supportsFamily:i]) {
311+
GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - MTLGPUFamilyApple1 + 1, i);
312+
break;
313+
}
314+
}
315+
316+
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
301317
GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
302318
if (ctx->device.maxTransferRate != 0) {
303319
GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
@@ -351,16 +367,18 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
351367
GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32);
352368
GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32);
353369
GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32);
354-
GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
355-
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
356-
GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
357-
GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
358-
GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
359-
GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
360-
GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
361-
GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
362-
GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
363-
GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
370+
if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
371+
GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
372+
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
373+
GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
374+
GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
375+
GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
376+
GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
377+
GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
378+
GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
379+
GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
380+
GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
381+
}
364382
GGML_METAL_DEL_KERNEL(rope_f32);
365383
GGML_METAL_DEL_KERNEL(rope_f16);
366384
GGML_METAL_DEL_KERNEL(alibi_f32);
@@ -986,32 +1004,36 @@ void ggml_metal_graph_compute(
9861004
} break;
9871005
case GGML_OP_MUL_MAT:
9881006
{
989-
// TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
990-
9911007
GGML_ASSERT(ne00 == ne10);
992-
// GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
993-
uint gqa = ne12/ne02;
9941008
GGML_ASSERT(ne03 == ne13);
9951009

1010+
const uint gqa = ne12/ne02;
1011+
9961012
// find the break-even point where the matrix-matrix kernel becomes more efficient compared
997-
// to the matrix-vector kernel. the numbers below are measured on M2 Ultra
998-
// not sure if this translates across all chips
1013+
// to the matrix-vector kernel
9991014
int ne11_mm_min = 1;
10001015

1001-
switch (src0t) {
1002-
case GGML_TYPE_F16: ne11_mm_min = 2; break;
1003-
case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
1004-
case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
1005-
case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
1006-
case GGML_TYPE_Q4_0:
1007-
case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
1008-
case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
1009-
case GGML_TYPE_Q5_0: // not tested yet
1010-
case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
1011-
case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
1012-
case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
1013-
default: ne11_mm_min = 1; break;
1016+
#if 0
1017+
// the numbers below are measured on M2 Ultra for 7B and 13B models
1018+
// these numbers do not translate to other devices or model sizes
1019+
// TODO: need to find a better approach
1020+
if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
1021+
switch (src0t) {
1022+
case GGML_TYPE_F16: ne11_mm_min = 2; break;
1023+
case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
1024+
case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
1025+
case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
1026+
case GGML_TYPE_Q4_0:
1027+
case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
1028+
case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
1029+
case GGML_TYPE_Q5_0: // not tested yet
1030+
case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
1031+
case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
1032+
case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
1033+
default: ne11_mm_min = 1; break;
1034+
}
10141035
}
1036+
#endif
10151037

10161038
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
10171039
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel

ggml-metal.metal

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2332,7 +2332,7 @@ kernel void kernel_get_rows(
23322332
}
23332333

23342334
#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
2335-
#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix A
2335+
#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
23362336
#define BLOCK_SIZE_K 32
23372337
#define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A
23382338
#define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B
@@ -2459,7 +2459,8 @@ kernel void kernel_mul_mm(device const uchar * src0,
24592459
}
24602460

24612461
threadgroup_barrier(mem_flags::mem_threadgroup);
2462-
device float * C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
2462+
2463+
device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
24632464
if (sgitg == 0) {
24642465
for (int i = 0; i < n_rows; i++) {
24652466
for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {

0 commit comments

Comments
 (0)