-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU] Adding AMDGPU dialect wrapper for ROCDL transpose loads. #145395
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
2009ede
50d19a6
087046a
fa30258
4259f63
c8157f0
bbb57ea
60e2c56
9bba79f
b5b4e6f
207f2f4
db9b837
32f0edf
c13aec2
94f73d5
1f03a6d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -898,6 +898,40 @@ def AMDGPU_GatherToLDSOp : | |
let hasVerifier = 1; | ||
} | ||
|
||
def AMDGPU_TransposeLoadOp : | ||
AMDGPU_Op<"transpose_load", [SameVariadicOperandSize]>, | ||
Arguments<(ins Arg<AnyMemRef, "buffer to transpose load from", [MemRead]>:$src, Variadic<Index>:$srcIndices)>, | ||
Results<(outs AnyTypeOf<[AnyVectorOfNonZeroRank]>:$result)> { | ||
let summary = "MLIR wrapper for CDNA Transpose Load instructions"; | ||
let description = [{ | ||
The `amdgpu.transpose_load` op is a wrapper around the `ds_read_tr` instructions. | ||
The transpose load op represents a subgroup load from LDS memory, | ||
where the subgroup of threads collectively reads a matrix from the source | ||
memref, with each thread reading a vector of the matrix, and gets a transposed matrix | ||
as the result. That is, each thread reads a vector of the column-major matrix at different | ||
indices, and the thread's read result is a vector of the corresponding row of the transposed | ||
matrix. | ||
|
||
This op is a direct wrapper around the ROCDL `ds_read_tr` family of intrinsics. Please refer | ||
to the CDNA4 ISA documentation for more details about its exact semantics. | ||
|
||
Format example: | ||
``` | ||
%0 = amdgpu.transpose_load %src[%srcIndices] : memref<128x256xf16> -> vector<4xf16> | ||
``` | ||
Operands: | ||
* `$src`: LDS memref to read from. | ||
* `$srcIndices`: indices into `$src` to read from for this thread. | ||
* `$result`: target register this transpose load instruction will write to. | ||
|
||
Note: Lowering is only supported on gfx950 and up. | ||
}]; | ||
let assemblyFormat = [{ | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I know other ops here don't provide examples, but I think it would be worth adding going forward -- I rely on these all the time. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I like your idea. So I tried to add a very simple example to show the format of the op. In terms of the semantics of the instruction, it is too hard to explain in a few sentences, so I wrote "please refer to the actual document for detailed explanation". There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Probably call out that you mean the CDNA4 ISA manual. |
||
$src `[` $srcIndices `]` attr-dict `:` type($src) `->` type($result) | ||
}]; | ||
let hasVerifier = 1; | ||
} | ||
|
||
def AMDGPU_ScaledMFMAOp : | ||
AMDGPU_Op<"scaled_mfma", [AllTypesMatch<["destC", "destD"]>, | ||
Pure]>, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
// RUN: mlir-opt %s --split-input-file -convert-amdgpu-to-rocdl=chipset=gfx950 | FileCheck %s | ||
// RUN: not mlir-opt %s --split-input-file -convert-amdgpu-to-rocdl=chipset=gfx945 2>&1 | FileCheck %s --check-prefix=CHECK-OLD | ||
|
||
// CHECK-LABEL: func @transpose_load_to_rocdl_4xf16 | ||
func.func @transpose_load_to_rocdl_4xf16(%idx1 : index, %idx2 : index, %wgmem : memref<128x72xf16, 3>) -> vector<4xf16> { | ||
// CHECK: rocdl.ds.read.tr16.b64 | ||
// CHECK-OLD: error: 'amdgpu.transpose_load' op Non-gfx950 chipset not supported | ||
%0 = amdgpu.transpose_load %wgmem[%idx1, %idx2] : memref<128x72xf16, 3> -> vector<4xf16> | ||
return %0 : vector<4xf16> | ||
} | ||
|
||
// ----- | ||
|
||
// CHECK-LABEL: func @transpose_load_to_rocdl_8xi8 | ||
func.func @transpose_load_to_rocdl_8xi8(%idx1 : index, %idx2 : index, %wgmem : memref<128x128xi8, 3>) -> vector<8xi8> { | ||
// CHECK: %[[RES:.*]] = rocdl.ds.read.tr8.b64 | ||
// CHECK-SAME: -> vector<2xi32> | ||
// CHECK-NEXT: llvm.bitcast %[[RES]] : vector<2xi32> to vector<8xi8> | ||
// CHECK-OLD: error: 'amdgpu.transpose_load' op Non-gfx950 chipset not supported | ||
%0 = amdgpu.transpose_load %wgmem[%idx1, %idx2] : memref<128x128xi8, 3> -> vector<8xi8> | ||
return %0 : vector<8xi8> | ||
} | ||
|
||
// ----- | ||
|
||
// CHECK-LABEL: func @transpose_load_to_rocdl_i4_memrefxi8 | ||
func.func @transpose_load_to_rocdl_i4_memrefxi8(%idx1 : index, %idx2 : index, %wgmem : memref<128x32xi8, 3>) -> vector<16xi4> { | ||
// CHECK: %[[RES:.*]] = rocdl.ds.read.tr4.b64 | ||
// CHECK-SAME: -> vector<2xi32> | ||
// CHECK-NEXT: llvm.bitcast %[[RES]] : vector<2xi32> to vector<16xi4> | ||
// CHECK-OLD: error: 'amdgpu.transpose_load' op Non-gfx950 chipset not supported | ||
%0 = amdgpu.transpose_load %wgmem[%idx1, %idx2] : memref<128x32xi8, 3> -> vector<16xi4> | ||
return %0 : vector<16xi4> | ||
} | ||
|
||
// ----- | ||
|
||
// CHECK-LABEL: func @transpose_load_to_rocdl_i6_memrefxi8 | ||
func.func @transpose_load_to_rocdl_i6_memrefxi8(%idx1 : index, %idx2 : index, %wgmem : memref<128x32xi8, 3>) -> vector<16xi6> { | ||
// CHECK: %[[RES:.*]] = rocdl.ds.read.tr6.b96 | ||
// CHECK-SAME: -> vector<3xi32> | ||
// CHECK-NEXT: llvm.bitcast %[[RES]] : vector<3xi32> to vector<16xi6> | ||
// CHECK-OLD: error: 'amdgpu.transpose_load' op Non-gfx950 chipset not supported | ||
%0 = amdgpu.transpose_load %wgmem[%idx1, %idx2] : memref<128x32xi8, 3> -> vector<16xi6> | ||
return %0 : vector<16xi6> | ||
} | ||
|
||
// ----- | ||
|
||
// CHECK-LABEL: func @transpose_load_to_rocdl_i16_memrefxi8 | ||
func.func @transpose_load_to_rocdl_i16_memrefxi8(%idx1 : index, %idx2 : index, %wgmem : memref<128x32xi8, 3>) -> vector<4xi16> { | ||
// CHECK: rocdl.ds.read.tr16.b64 | ||
// CHECK-OLD: error: 'amdgpu.transpose_load' op Non-gfx950 chipset not supported | ||
%0 = amdgpu.transpose_load %wgmem[%idx1, %idx2] : memref<128x32xi8, 3> -> vector<4xi16> | ||
return %0 : vector<4xi16> | ||
} | ||
krzysz00 marked this conversation as resolved.
Show resolved
Hide resolved
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
// RUN: not mlir-opt %s --split-input-file -convert-amdgpu-to-rocdl=chipset=gfx950 2>&1 | FileCheck %s | ||
|
||
// ----- | ||
|
||
func.func @transpose_load_to_rocdl_16xi4(%idx1 : index, %idx2 : index, %wgmem : memref<128x16xi4, 3>) -> vector<16xi4> { | ||
// CHECK: memref to have at least 8 bits element size, got 4 | ||
%0 = amdgpu.transpose_load %wgmem[%idx1, %idx2] : memref<128x16xi4, 3> -> vector<16xi4> | ||
return %0 : vector<16xi4> | ||
} | ||
|
||
// ----- | ||
|
||
func.func @transpose_load_to_rocdl_16xi6(%idx1 : index, %idx2 : index, %wgmem : memref<128x32xi6, 3>) -> vector<16xi6> { | ||
// CHECK: memref to have at least 8 bits element size, got 6 | ||
%0 = amdgpu.transpose_load %wgmem[%idx1, %idx2] : memref<128x32xi6, 3> -> vector<16xi6> | ||
return %0 : vector<16xi6> | ||
} |
Uh oh!
There was an error while loading. Please reload this page.