diff --git a/clang/include/clang/Driver/Action.h b/clang/include/clang/Driver/Action.h index 061cac249c906..15921528c960a 100644 --- a/clang/include/clang/Driver/Action.h +++ b/clang/include/clang/Driver/Action.h @@ -772,7 +772,14 @@ class FileTableTformJobAction : public JobAction { public: struct Tform { - enum Kind { EXTRACT, EXTRACT_DROP_TITLE, REPLACE, RENAME }; + enum Kind { + EXTRACT, + EXTRACT_DROP_TITLE, + REPLACE, + REPLACE_CELL, + RENAME, + COPY_SINGLE_FILE + }; Tform() = default; Tform(Kind K, std::initializer_list Args) : TheKind(K) { @@ -794,10 +801,19 @@ class FileTableTformJobAction : public JobAction { // from another file table passed as input to this action. void addReplaceColumnTform(StringRef From, StringRef To); + // Replaces a cell in this table with column title and row + // with the file name passed as input to this action. + void addReplaceCellTform(StringRef ColumnName, int Row); + // Renames a column with title in this table with a column with title // passed as input to this action. void addRenameColumnTform(StringRef From, StringRef To); + // Specifies that, instead of generating a new table, the transformation + // should copy the file at column and row into the + // output file. + void addCopySingleFileTform(StringRef ColumnName, int Row); + static bool classof(const Action *A) { return A->getKind() == FileTableTformJobClass; } @@ -806,6 +822,9 @@ class FileTableTformJobAction : public JobAction { private: SmallVector Tforms; // transformation actions requested + + // column to copy single file from if requested + std::string CopySingleFileColumnName; }; class AppendFooterJobAction : public JobAction { diff --git a/clang/lib/Driver/Action.cpp b/clang/lib/Driver/Action.cpp index a330722d705de..4ff09ca4e216b 100644 --- a/clang/lib/Driver/Action.cpp +++ b/clang/lib/Driver/Action.cpp @@ -507,11 +507,23 @@ void FileTableTformJobAction::addReplaceColumnTform(StringRef From, Tforms.emplace_back(Tform(Tform::REPLACE, {From, To})); } +void FileTableTformJobAction::addReplaceCellTform(StringRef ColumnName, + int Row) { + Tforms.emplace_back( + Tform(Tform::REPLACE_CELL, {ColumnName, std::to_string(Row)})); +} + void FileTableTformJobAction::addRenameColumnTform(StringRef From, StringRef To) { Tforms.emplace_back(Tform(Tform::RENAME, {From, To})); } +void FileTableTformJobAction::addCopySingleFileTform(StringRef ColumnName, + int Row) { + Tforms.emplace_back( + Tform(Tform::COPY_SINGLE_FILE, {ColumnName, std::to_string(Row)})); +} + void AppendFooterJobAction::anchor() {} AppendFooterJobAction::AppendFooterJobAction(Action *Input, types::ID Type) diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 42f1ea275d8e7..6112ea4902d76 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -4378,19 +4378,19 @@ class OffloadingActionBuilder final { // .--------------------------------------. // | PostLink | // .--------------------------------------. - // [.n] [+*] [+*] + // [+n] [+*] [+] // | | | - // | .-----------------. | - // | | FileTableTform | | - // | | (extract "Code")| | - // | .-----------------. | - // | [-] | + // .----------------. .-----------------. | + // | FileTableTform | | FileTableTform | | + // | (copy "Code") | | (extract "Code")| | + // .----------------. .-----------------. | + // [.] [-] | // | | | - // | [-*] | - // .-------------. .-------------------. | - // |finalizeNVPTX| | SPIRVTranslator | | - // .-------------. .-------------------. | - // | [-as] [-!a] | + // [.] [-*] | + // .---------------. .-------------------. | + // | finalizeNVPTX | | SPIRVTranslator | | + // .---------------. .-------------------. | + // [.] [-as] [-!a] | // | | | | // | [-s] | | // | .----------------. | | @@ -4398,13 +4398,13 @@ class OffloadingActionBuilder final { // | .----------------. | | // | [-s] | | // | | | | - // | [-a] [-!a] [+] - // | .--------------------. - // | | FileTableTform | - // | | (replace "Code") | - // | .--------------------. - // | | - // [.n] [+*] + // [.] [-a] [-!a] [+] + // .------------------------------------. + // | FileTableTform | + // | (replace "Code") | + // .------------------------------------. + // | + // [+] // .--------------------------------------. // | OffloadWrapper | // .--------------------------------------. @@ -4451,24 +4451,40 @@ class OffloadingActionBuilder final { ActionList WrapperInputs; // post link is not optional - even if not splitting, always need to // process specialization constants - types::ID PostLinkOutType = - isNVPTX || isAMDGCN ? types::TY_LLVM_BC : types::TY_Tempfiletable; auto *PostLinkAction = C.MakeAction( - FullDeviceLinkAction, PostLinkOutType); + FullDeviceLinkAction, types::TY_Tempfiletable); PostLinkAction->setRTSetsSpecConstants(!isAOT); - if (isNVPTX) { - Action *FinAction = - finalizeNVPTXDependences(PostLinkAction, (*TC)->getTriple()); - WrapperInputs.push_back(FinAction); - } else if (isAMDGCN) { - Action *FinAction = - finalizeAMDGCNDependences(PostLinkAction, (*TC)->getTriple()); - WrapperInputs.push_back(FinAction); + constexpr char COL_CODE[] = "Code"; + + if (isNVPTX || isAMDGCN) { + // Make extraction copy the only remaining code file instead of + // creating a new table with a single entry. + // TODO: Process all PTX code files in file table to enable code + // splitting for PTX target. + auto *ExtractIRFilesAction = C.MakeAction( + PostLinkAction, types::TY_LLVM_BC); + ExtractIRFilesAction->addCopySingleFileTform(COL_CODE, 0); + + Action *FinAction; + if (isNVPTX) { + FinAction = finalizeNVPTXDependences(ExtractIRFilesAction, + (*TC)->getTriple()); + } else /* isAMDGCN */ { + FinAction = finalizeAMDGCNDependences(ExtractIRFilesAction, + (*TC)->getTriple()); + } + ActionList TformInputs{PostLinkAction, FinAction}; + + // Replace the only code entry in the table, as confirmed by the + // previous transformation. + auto *ReplaceFilesAction = C.MakeAction( + TformInputs, types::TY_Tempfiletable); + ReplaceFilesAction->addReplaceCellTform(COL_CODE, 0); + WrapperInputs.push_back(ReplaceFilesAction); } else { // For SPIRV-based targets - translate to SPIRV then optionally // compile ahead-of-time to native architecture - constexpr char COL_CODE[] = "Code"; auto *ExtractIRFilesAction = C.MakeAction( PostLinkAction, types::TY_Tempfilelist); // single column w/o title fits TY_Tempfilelist format @@ -4513,6 +4529,7 @@ class OffloadingActionBuilder final { ReplaceFilesAction->addReplaceColumnTform(COL_CODE, COL_CODE); WrapperInputs.push_back(ReplaceFilesAction); } + // After the Link, wrap the files before the final host link auto *DeviceWrappingAction = C.MakeAction( WrapperInputs, types::TY_Object); diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index d6bf82935943b..285d80a133eb9 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -8859,6 +8859,9 @@ void SYCLPostLink::ConstructJob(Compilation &C, const JobAction &JA, TCArgs.hasFlag(options::OPT_fsycl_dead_args_optimization, options::OPT_fno_sycl_dead_args_optimization, false)) addArgs(CmdArgs, TCArgs, {"-emit-param-info"}); + // Enable PI program metadata + if (getToolChain().getTriple().isNVPTX()) + addArgs(CmdArgs, TCArgs, {"-emit-program-metadata"}); if (JA.getType() == types::TY_LLVM_BC) { // single file output requested - this means only perform necessary IR // transformations (like specialization constant intrinsic lowering) and @@ -8945,6 +8948,15 @@ void FileTableTform::ConstructJob(Compilation &C, const JobAction &JA, addArgs(CmdArgs, TCArgs, {Arg}); break; } + case FileTableTformJobAction::Tform::REPLACE_CELL: { + assert(Tf.TheArgs.size() == 2 && "column name and row id expected"); + SmallString<128> Arg("-replace_cell="); + Arg += Tf.TheArgs[0]; + Arg += ","; + Arg += Tf.TheArgs[1]; + addArgs(CmdArgs, TCArgs, {Arg}); + break; + } case FileTableTformJobAction::Tform::RENAME: { assert(Tf.TheArgs.size() == 2 && "from/to names expected"); SmallString<128> Arg("-rename="); @@ -8954,8 +8966,18 @@ void FileTableTform::ConstructJob(Compilation &C, const JobAction &JA, addArgs(CmdArgs, TCArgs, {Arg}); break; } + case FileTableTformJobAction::Tform::COPY_SINGLE_FILE: { + assert(Tf.TheArgs.size() == 2 && "column name and row id expected"); + SmallString<128> Arg("-copy_single_file="); + Arg += Tf.TheArgs[0]; + Arg += ","; + Arg += Tf.TheArgs[1]; + addArgs(CmdArgs, TCArgs, {Arg}); + break; + } } } + // 2) add output option assert(Output.isFilename() && "table tform output must be a file"); addArgs(CmdArgs, TCArgs, {"-o", Output.getFilename()}); diff --git a/clang/test/Driver/sycl-offload-amdgcn.cpp b/clang/test/Driver/sycl-offload-amdgcn.cpp index 85884e27524b6..cbfaad9980337 100644 --- a/clang/test/Driver/sycl-offload-amdgcn.cpp +++ b/clang/test/Driver/sycl-offload-amdgcn.cpp @@ -28,10 +28,12 @@ // CHK-PHASES-NO-CC: 9: assembler, {8}, object, (host-sycl) // CHK-PHASES-NO-CC: 10: linker, {9}, image, (host-sycl) // CHK-PHASES-NO-CC: 11: linker, {5}, ir, (device-sycl) -// CHK-PHASES-NO-CC: 12: sycl-post-link, {11}, ir, (device-sycl) -// CHK-PHASES-NO-CC: 13: backend, {12}, assembler, (device-sycl) -// CHK-PHASES-NO-CC: 14: assembler, {13}, object, (device-sycl) -// CHK-PHASES-NO-CC: 15: linker, {14}, image, (device-sycl) -// CHK-PHASES-NO-CC: 16: linker, {15}, hip-fatbin, (device-sycl) -// CHK-PHASES-NO-CC: 17: clang-offload-wrapper, {16}, object, (device-sycl) -// CHK-PHASES-NO-CC: 18: offload, "host-sycl (x86_64-unknown-linux-gnu)" {10}, "device-sycl (amdgcn-amd-amdhsa-sycldevice)" {17}, image +// CHK-PHASES-NO-CC: 12: sycl-post-link, {11}, tempfiletable, (device-sycl) +// CHK-PHASES-NO-CC: 13: file-table-tform, {12}, ir, (device-sycl) +// CHK-PHASES-NO-CC: 14: backend, {13}, assembler, (device-sycl) +// CHK-PHASES-NO-CC: 15: assembler, {14}, object, (device-sycl) +// CHK-PHASES-NO-CC: 16: linker, {15}, image, (device-sycl) +// CHK-PHASES-NO-CC: 17: linker, {16}, hip-fatbin, (device-sycl) +// CHK-PHASES-NO-CC: 18: file-table-tform, {12, 17}, tempfiletable, (device-sycl) +// CHK-PHASES-NO-CC: 19: clang-offload-wrapper, {18}, object, (device-sycl) +// CHK-PHASES-NO-CC: 20: offload, "host-sycl (x86_64-unknown-linux-gnu)" {10}, "device-sycl (amdgcn-amd-amdhsa-sycldevice)" {19}, image diff --git a/clang/test/Driver/sycl-offload-nvptx.cpp b/clang/test/Driver/sycl-offload-nvptx.cpp index 3f683d41fa136..fb8fb31a68cea 100644 --- a/clang/test/Driver/sycl-offload-nvptx.cpp +++ b/clang/test/Driver/sycl-offload-nvptx.cpp @@ -28,10 +28,12 @@ // CHK-PHASES-NO-CC: 9: assembler, {8}, object, (host-sycl) // CHK-PHASES-NO-CC: 10: linker, {9}, image, (host-sycl) // CHK-PHASES-NO-CC: 11: linker, {5}, ir, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 12: sycl-post-link, {11}, ir, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 13: backend, {12}, assembler, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 14: clang-offload-wrapper, {13}, object, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 15: offload, "host-sycl (x86_64-unknown-linux-gnu)" {10}, "device-sycl (nvptx64-nvidia-nvcl-sycldevice:sm_50)" {14}, image +// CHK-PHASES-NO-CC: 12: sycl-post-link, {11}, tempfiletable, (device-sycl, sm_50) +// CHK-PHASES-NO-CC: 13: file-table-tform, {12}, ir, (device-sycl, sm_50) +// CHK-PHASES-NO-CC: 14: backend, {13}, assembler, (device-sycl, sm_50) +// CHK-PHASES-NO-CC: 15: file-table-tform, {12, 14}, tempfiletable, (device-sycl, sm_50) +// CHK-PHASES-NO-CC: 16: clang-offload-wrapper, {15}, object, (device-sycl, sm_50) +// CHK-PHASES-NO-CC: 17: offload, "host-sycl (x86_64-unknown-linux-gnu)" {10}, "device-sycl (nvptx64-nvidia-nvcl-sycldevice:sm_50)" {16}, image /// Check phases specifying a compute capability. // RUN: %clangxx -ccc-print-phases -std=c++11 -target x86_64-unknown-linux-gnu -fsycl \ @@ -50,7 +52,9 @@ // CHK-PHASES: 9: assembler, {8}, object, (host-sycl) // CHK-PHASES: 10: linker, {9}, image, (host-sycl) // CHK-PHASES: 11: linker, {5}, ir, (device-sycl, sm_35) -// CHK-PHASES: 12: sycl-post-link, {11}, ir, (device-sycl, sm_35) -// CHK-PHASES: 13: backend, {12}, assembler, (device-sycl, sm_35) -// CHK-PHASES: 14: clang-offload-wrapper, {13}, object, (device-sycl, sm_35) -// CHK-PHASES: 15: offload, "host-sycl (x86_64-unknown-linux-gnu)" {10}, "device-sycl (nvptx64-nvidia-nvcl-sycldevice:sm_35)" {14}, image +// CHK-PHASES: 12: sycl-post-link, {11}, tempfiletable, (device-sycl, sm_35) +// CHK-PHASES: 13: file-table-tform, {12}, ir, (device-sycl, sm_35) +// CHK-PHASES: 14: backend, {13}, assembler, (device-sycl, sm_35) +// CHK-PHASES: 15: file-table-tform, {12, 14}, tempfiletable, (device-sycl, sm_35) +// CHK-PHASES: 16: clang-offload-wrapper, {15}, object, (device-sycl, sm_35) +// CHK-PHASES: 17: offload, "host-sycl (x86_64-unknown-linux-gnu)" {10}, "device-sycl (nvptx64-nvidia-nvcl-sycldevice:sm_35)" {16}, image diff --git a/llvm/include/llvm/Support/PropertySetIO.h b/llvm/include/llvm/Support/PropertySetIO.h index 3c332e3ca1f0b..ee5c8443deb06 100644 --- a/llvm/include/llvm/Support/PropertySetIO.h +++ b/llvm/include/llvm/Support/PropertySetIO.h @@ -188,6 +188,7 @@ class PropertySetRegistry { "SYCL/specialization constants default values"; static constexpr char SYCL_DEVICELIB_REQ_MASK[] = "SYCL/devicelib req mask"; static constexpr char SYCL_KERNEL_PARAM_OPT_INFO[] = "SYCL/kernel param opt"; + static constexpr char SYCL_PROGRAM_METADATA[] = "SYCL/program metadata"; static constexpr char SYCL_MISC_PROP[] = "SYCL/misc properties"; static constexpr char SYCL_ASSERT_USED[] = "SYCL/assert used"; diff --git a/llvm/include/llvm/Support/SimpleTable.h b/llvm/include/llvm/Support/SimpleTable.h index 85ce21beb53ca..e91f288a6bc26 100644 --- a/llvm/include/llvm/Support/SimpleTable.h +++ b/llvm/include/llvm/Support/SimpleTable.h @@ -97,6 +97,9 @@ class SimpleTable { Error replaceColumn(StringRef Name, const SimpleTable &Src, StringRef SrcName = ""); + // Replaces the value in a cell at a given column and row with the new value. + Error updateCellValue(StringRef ColName, int Row, StringRef NewValue); + // Renames a column. Error renameColumn(StringRef OldName, StringRef NewName); diff --git a/llvm/lib/Support/PropertySetIO.cpp b/llvm/lib/Support/PropertySetIO.cpp index 03ad6bb0b4cf9..1e92f1f8b9e3f 100644 --- a/llvm/lib/Support/PropertySetIO.cpp +++ b/llvm/lib/Support/PropertySetIO.cpp @@ -197,6 +197,7 @@ constexpr char PropertySetRegistry::SYCL_SPECIALIZATION_CONSTANTS[]; constexpr char PropertySetRegistry::SYCL_DEVICELIB_REQ_MASK[]; constexpr char PropertySetRegistry::SYCL_SPEC_CONSTANTS_DEFAULT_VALUES[]; constexpr char PropertySetRegistry::SYCL_KERNEL_PARAM_OPT_INFO[]; +constexpr char PropertySetRegistry::SYCL_PROGRAM_METADATA[]; constexpr char PropertySetRegistry::SYCL_MISC_PROP[]; constexpr char PropertySetRegistry::SYCL_ASSERT_USED[]; diff --git a/llvm/lib/Support/SimpleTable.cpp b/llvm/lib/Support/SimpleTable.cpp index a89087275c4d0..41aeb57b15d28 100644 --- a/llvm/lib/Support/SimpleTable.cpp +++ b/llvm/lib/Support/SimpleTable.cpp @@ -109,6 +109,16 @@ Error SimpleTable::replaceColumn(StringRef Name, const SimpleTable &Src, return Error::success(); } +Error SimpleTable::updateCellValue(StringRef ColName, int Row, + StringRef NewValue) { + if (getNumColumns() == 0) + return makeError("empty table"); + if (Row > getNumRows() || Row < 0) + return makeError("row index out of bounds"); + Rows[Row][getColumnId(ColName)] = NewValue.str(); + return Error::success(); +} + Error SimpleTable::renameColumn(StringRef OldName, StringRef NewName) { int I = getColumnId(OldName); diff --git a/llvm/test/tools/file-table-tform/Inputs/s.txt b/llvm/test/tools/file-table-tform/Inputs/s.txt new file mode 100644 index 0000000000000..babdf2fced655 --- /dev/null +++ b/llvm/test/tools/file-table-tform/Inputs/s.txt @@ -0,0 +1,2 @@ +[A|B|C|D] +aaa|bbb|100|XXX diff --git a/llvm/test/tools/file-table-tform/file-table-tform-single.test b/llvm/test/tools/file-table-tform/file-table-tform-single.test new file mode 100644 index 0000000000000..b64e4baf8c419 --- /dev/null +++ b/llvm/test/tools/file-table-tform/file-table-tform-single.test @@ -0,0 +1,8 @@ +-- Insert %S/Inputs/gold.txt into column A at row index 0 +RUN: file-table-tform --replace_cell=A,0 %S/Inputs/s.txt %S/Inputs/gold.txt -o t.txt + +-- Copy file in column A from the only row +RUN: file-table-tform --copy_single_file=A,0 t.txt -o u.txt + +-- Verify result +RUN: diff u.txt %S/Inputs/gold.txt diff --git a/llvm/test/tools/sycl-post-link/emit_program_metadata.ll b/llvm/test/tools/sycl-post-link/emit_program_metadata.ll new file mode 100644 index 0000000000000..93b71be47bd81 --- /dev/null +++ b/llvm/test/tools/sycl-post-link/emit_program_metadata.ll @@ -0,0 +1,24 @@ +; This test checks that the post-link tool generates SYCL program metadata. +; +; RUN: sycl-post-link -emit-program-metadata -S %s -o %t.files.table +; RUN: FileCheck %s -input-file=%t.files.table --check-prefixes CHECK-TABLE +; RUN: FileCheck %s -input-file=%t.files_0.prop --match-full-lines --check-prefixes CHECK-PROP + +target triple = "spir64-unknown-unknown-sycldevice" + +!0 = !{i32 1, i32 2, i32 4} + +define weak_odr spir_kernel void @SpirKernel1(float %arg1) !reqd_work_group_size !0 { + call void @foo(float %arg1) + ret void +} + +declare void @foo(float) + +; CHECK-PROP: [SYCL/program metadata] +; // Base64 encoding in the prop file (including 8 bytes length): +; CHECK-PROP-NEXT: SpirKernel1@reqd_work_group_size=2|gBAAAAAAAAQAAAAACAAAAQAAAAA + +; CHECK-TABLE: [Code|Properties] +; CHECK-TABLE-NEXT: {{.*}}files_0.prop +; CHECK-TABLE-EMPTY: diff --git a/llvm/tools/file-table-tform/file-table-tform.cpp b/llvm/tools/file-table-tform/file-table-tform.cpp index 6cbccd37a4bf4..1ded228953a00 100644 --- a/llvm/tools/file-table-tform/file-table-tform.cpp +++ b/llvm/tools/file-table-tform/file-table-tform.cpp @@ -78,13 +78,20 @@ static cl::opt Output("o", cl::Required, cl::cat(FileTableTformCat)); static constexpr char OPT_REPLACE[] = "replace"; +static constexpr char OPT_REPLACE_CELL[] = "replace_cell"; static constexpr char OPT_RENAME[] = "rename"; static constexpr char OPT_EXTRACT[] = "extract"; +static constexpr char OPT_COPY_SINGLE_FILE[] = "copy_single_file"; static cl::list TformReplace{ OPT_REPLACE, cl::ZeroOrMore, cl::desc("replace a column"), cl::value_desc(""), cl::cat(FileTableTformCat)}; +static cl::list TformReplaceCell{ + OPT_REPLACE_CELL, cl::ZeroOrMore, cl::desc("replace a cell"), + cl::value_desc(","), + cl::cat(FileTableTformCat)}; + static cl::list TformRename{ OPT_RENAME, cl::ZeroOrMore, cl::desc("rename a column"), cl::value_desc(","), cl::cat(FileTableTformCat)}; @@ -94,6 +101,12 @@ static cl::list TformExtract{ cl::desc("extract column(s) identified by names"), cl::value_desc(",,..."), cl::cat(FileTableTformCat)}; +static cl::list TformCopySingleFile{ + OPT_COPY_SINGLE_FILE, cl::ZeroOrMore, + cl::desc("copy the file in a cell and make it the output"), + cl::value_desc(","), + cl::cat(FileTableTformCat)}; + static cl::opt DropTitles{"drop_titles", cl::Optional, cl::desc("drop column titles"), cl::cat(FileTableTformCat)}; @@ -152,8 +165,14 @@ struct TformCmd { [&](TformCmd *Cmd) { return Cmd->consumeSingleInput(Cur, End); }) + .Case(OPT_REPLACE_CELL, + [&](TformCmd *Cmd) { + return Cmd->consumeSingleInput(Cur, End); + }) .Case(OPT_RENAME, [&](TformCmd *Cmd) { return Error::success(); }) - .Case(OPT_EXTRACT, [&](TformCmd *Cmd) { return Error::success(); }); + .Case(OPT_EXTRACT, [&](TformCmd *Cmd) { return Error::success(); }) + .Case(OPT_COPY_SINGLE_FILE, + [&](TformCmd *Cmd) { return Error::success(); }); return F(this); } @@ -174,6 +193,18 @@ struct TformCmd { Twine(OPT_REPLACE)); return Error::success(); }) + .Case(OPT_REPLACE_CELL, + [&](TformCmd *Cmd) -> Error { + // argument is , + if (Arg.empty()) + return makeUserError("empty argument in " + + Twine(OPT_REPLACE_CELL)); + Arg.split(Args, ','); + if (Args.size() != 2 || Args[0].empty() || Args[1].empty()) + return makeUserError("invalid argument in " + + Twine(OPT_REPLACE_CELL)); + return Error::success(); + }) .Case(OPT_RENAME, [&](TformCmd *Cmd) -> Error { // argument is , @@ -188,15 +219,30 @@ struct TformCmd { Args.push_back(Names.second); return Error::success(); }) - .Case(OPT_EXTRACT, [&](TformCmd *Cmd) -> Error { - // argument is ,,... (1 or more) + .Case( + OPT_EXTRACT, + [&](TformCmd *Cmd) -> Error { + // argument is ,,... (1 or more) + if (Arg.empty()) + return makeUserError("empty argument in " + + Twine(OPT_RENAME)); + SmallVector Names; + Arg.split(Names, ','); + if (std::find(Names.begin(), Names.end(), "") != Names.end()) + return makeUserError("empty name in " + Twine(OPT_RENAME)); + std::copy(Names.begin(), Names.end(), + std::back_inserter(Args)); + return Error::success(); + }) + .Case(OPT_COPY_SINGLE_FILE, [&](TformCmd *Cmd) -> Error { + // argument is , if (Arg.empty()) - return makeUserError("empty argument in " + Twine(OPT_RENAME)); - SmallVector Names; - Arg.split(Names, ','); - if (std::find(Names.begin(), Names.end(), "") != Names.end()) - return makeUserError("empty name in " + Twine(OPT_RENAME)); - std::copy(Names.begin(), Names.end(), std::back_inserter(Args)); + return makeUserError("empty argument in " + + Twine(OPT_COPY_SINGLE_FILE)); + Arg.split(Args, ','); + if (Args.size() != 2 || Args[0].empty() || Args[1].empty()) + return makeUserError("invalid argument in " + + Twine(OPT_COPY_SINGLE_FILE)); return Error::success(); }); return F(this); @@ -217,6 +263,15 @@ struct TformCmd { Table.replaceColumn(Args[0], *Table1->get(), Args[1]); return Res ? std::move(Res) : std::move(Error::success()); }) + .Case(OPT_REPLACE_CELL, + [&](TformCmd *Cmd) -> Error { + // argument is , + assert(Args.size() == 2 && Cmd->Inputs.size() == 1); + const int Row = std::stoi(Args[1].str()); + Error Res = + Table.updateCellValue(Args[0], Row, Cmd->Inputs[0]); + return Res ? std::move(Res) : std::move(Error::success()); + }) .Case(OPT_RENAME, [&](TformCmd *Cmd) -> Error { // argument is , @@ -224,11 +279,30 @@ struct TformCmd { Error Res = Table.renameColumn(Args[0], Args[1]); return Res ? std::move(Res) : std::move(Error::success()); }) - .Case(OPT_EXTRACT, [&](TformCmd *Cmd) -> Error { - // argument is ,,... (1 or more) - assert(!Args.empty()); - Error Res = Table.peelColumns(Args); - return Res ? std::move(Res) : std::move(Error::success()); + .Case(OPT_EXTRACT, + [&](TformCmd *Cmd) -> Error { + // argument is ,,... (1 or more) + assert(!Args.empty()); + Error Res = Table.peelColumns(Args); + return Res ? std::move(Res) : std::move(Error::success()); + }) + .Case(OPT_COPY_SINGLE_FILE, [&](TformCmd *Cmd) -> Error { + // argument is , + assert(Args.size() == 2); + const int Row = std::stoi(Args[1].str()); + if (Row >= Table.getNumRows()) + return makeUserError("row index is out of bounds"); + + // Copy the file from the only remaining row at specified + // column + StringRef FileToCopy = Table[Row].getCell(Args[0], ""); + + if (FileToCopy.empty()) + return makeUserError("no file found in specified column"); + + std::error_code EC = sys::fs::copy_file(FileToCopy, Output); + return EC ? createFileError(Output, EC) + : std::move(Error::success()); }); return F(this); } @@ -253,13 +327,16 @@ int main(int argc, char **argv) { "File table transformation tool.\n" "Inputs and output of this tool is a \"file table\" files containing\n" "2D table of strings with optional row of column titles. Based on\n" - "transformation actions passed via the command line, the tool " - "transforms the first input file table and emits a new one as a result.\n" + "transformation actions passed via the command line, the tool\n" + "transforms the first input file table and emits either a new file\n" + "table or a copy of a file in a cell of the input table.\n" "\n" "Transformation actions are:\n" "- replace a column\n" + "- replace a cell\n" "- rename a column\n" - "- extract column(s)\n"); + "- extract column(s)\n" + "- ouput a copy of a file in a cell\n"); std::map Cmds; @@ -267,8 +344,9 @@ int main(int argc, char **argv) { // yet, as an order across all command line options-commands needs to be // established first to properly map inputs to commands. - auto Lists = {std::addressof(TformReplace), std::addressof(TformRename), - std::addressof(TformExtract)}; + auto Lists = {std::addressof(TformReplace), std::addressof(TformReplaceCell), + std::addressof(TformRename), std::addressof(TformExtract), + std::addressof(TformCopySingleFile)}; for (const auto *L : Lists) { for (auto It = L->begin(); It != L->end(); It++) { @@ -289,6 +367,16 @@ int main(int argc, char **argv) { CHECK_AND_EXIT(makeUserError("no inputs")); std::string &InputFile = *CurInput++; + // ensure that if copy_single_file is specified, it must be the last tform + bool HasCopySingleFileTform = false; + for (auto &P : Cmds) { + if (HasCopySingleFileTform) { + CHECK_AND_EXIT( + makeUserError("copy_single_file must be the last transformation")); + } + HasCopySingleFileTform = P.second->Kind == OPT_COPY_SINGLE_FILE; + } + for (auto &P : Cmds) { TformCmd::UPtrTy &Cmd = P.second; // this will advance cur iterator as far as needed @@ -308,16 +396,21 @@ int main(int argc, char **argv) { Error Res = Cmd->execute(*Table->get()); CHECK_AND_EXIT(std::move(Res)); } - // Finally, write the result - std::error_code EC; - raw_fd_ostream Out{Output, EC, sys::fs::OpenFlags::OF_None}; - if (EC) - CHECK_AND_EXIT(createFileError(Output, EC)); - Table->get()->write(Out, !DropTitles); + // If copy_single_file was specified the output file is generated by the + // corresponding transformation. + if (!HasCopySingleFileTform) { + // Write the transformed table to file + std::error_code EC; + raw_fd_ostream Out{Output, EC, sys::fs::OpenFlags::OF_None}; + + if (EC) + CHECK_AND_EXIT(createFileError(Output, EC)); + Table->get()->write(Out, !DropTitles); - if (Out.has_error()) - CHECK_AND_EXIT(createFileError(Output, Out.error())); - Out.close(); + if (Out.has_error()) + CHECK_AND_EXIT(createFileError(Output, Out.error())); + Out.close(); + } return 0; } diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp index bdd27ec41b4bb..eacb63b4792ec 100644 --- a/llvm/tools/sycl-post-link/sycl-post-link.cpp +++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp @@ -164,12 +164,17 @@ static cl::opt EmitKernelParamInfo{ "emit-param-info", cl::desc("emit kernel parameter optimization info"), cl::cat(PostLinkCat)}; +static cl::opt EmitProgramMetadata{"emit-program-metadata", + cl::desc("emit SYCL program metadata"), + cl::cat(PostLinkCat)}; + struct ImagePropSaveInfo { bool NeedDeviceLibReqMask; bool DoSpecConst; bool SetSpecConstAtRT; bool SpecConstsMet; bool EmitKernelParamInfo; + bool EmitProgramMetadata; bool IsEsimdKernel; }; @@ -376,6 +381,23 @@ static HasAssertStatus hasAssertInFunctionCallGraph(llvm::Function *Func) { return No_Assert; } +// Gets reqd_work_group_size information for function Func. +static std::vector +getKernelReqdWorkGroupSizeMetadata(const Function &Func) { + auto ReqdWorkGroupSizeMD = Func.getMetadata("reqd_work_group_size"); + if (!ReqdWorkGroupSizeMD) + return {}; + // TODO: Remove 3-operand assumption when it is relaxed. + assert(ReqdWorkGroupSizeMD->getNumOperands() == 3); + uint32_t X = mdconst::extract(ReqdWorkGroupSizeMD->getOperand(0)) + ->getZExtValue(); + uint32_t Y = mdconst::extract(ReqdWorkGroupSizeMD->getOperand(1)) + ->getZExtValue(); + uint32_t Z = mdconst::extract(ReqdWorkGroupSizeMD->getOperand(2)) + ->getZExtValue(); + return {X, Y, Z}; +} + // Input parameter KernelModuleMap is a map containing groups of kernels with // same values of the sycl-module-id attribute. ResSymbolsLists is a vector of // kernel name lists. Each vector element is a string with kernel names from the @@ -563,6 +585,24 @@ static string_vector saveDeviceImageProperty( } } + // Metadata names may be composite so we keep them alive until the + // properties have been written. + SmallVector MetadataNames; + if (ImgPSInfo.EmitProgramMetadata) { + auto &ProgramMetadata = + PropSet[llvm::util::PropertySetRegistry::SYCL_PROGRAM_METADATA]; + + // Add reqd_work_group_size information to program metadata + for (const Function &Func : ResultModules[I]->functions()) { + std::vector KernelReqdWorkGroupSize = + getKernelReqdWorkGroupSizeMetadata(Func); + if (KernelReqdWorkGroupSize.empty()) + continue; + MetadataNames.push_back(Func.getName().str() + "@reqd_work_group_size"); + ProgramMetadata.insert({MetadataNames.back(), KernelReqdWorkGroupSize}); + } + } + if (ImgPSInfo.IsEsimdKernel) { PropSet[llvm::util::PropertySetRegistry::SYCL_MISC_PROP].insert( {"isEsimdImage", true}); @@ -747,7 +787,8 @@ static TableFiles processOneModule(std::unique_ptr M, bool IsEsimd, { ImagePropSaveInfo ImgPSInfo = { true, DoSpecConst, SetSpecConstAtRT, - SpecConstsMet, EmitKernelParamInfo, IsEsimd}; + SpecConstsMet, EmitKernelParamInfo, EmitProgramMetadata, + IsEsimd}; string_vector Files = saveDeviceImageProperty(ResultModules, ImgPSInfo); std::copy(Files.begin(), Files.end(), std::back_inserter(TblFiles[COL_PROPS])); @@ -897,8 +938,10 @@ int main(int argc, char **argv) { bool DoSplitEsimd = SplitEsimd.getNumOccurrences() > 0; bool DoSpecConst = SpecConstLower.getNumOccurrences() > 0; bool DoParamInfo = EmitKernelParamInfo.getNumOccurrences() > 0; + bool DoProgMetadata = EmitProgramMetadata.getNumOccurrences() > 0; - if (!DoSplit && !DoSpecConst && !DoSymGen && !DoParamInfo && !DoSplitEsimd) { + if (!DoSplit && !DoSpecConst && !DoSymGen && !DoParamInfo && + !DoProgMetadata && !DoSplitEsimd) { errs() << "no actions specified; try --help for usage info\n"; return 1; } @@ -922,6 +965,11 @@ int main(int argc, char **argv) { << " -" << IROutputOnly.ArgStr << "\n"; return 1; } + if (IROutputOnly && DoProgMetadata) { + errs() << "error: -" << EmitProgramMetadata.ArgStr << " can't be used with" + << " -" << IROutputOnly.ArgStr << "\n"; + return 1; + } SMDiagnostic Err; std::unique_ptr M = parseIRFile(InputFilename, Err, Context); // It is OK to use raw pointer here as we control that it does not outlive M diff --git a/llvm/unittests/Support/SimpleTableTest.cpp b/llvm/unittests/Support/SimpleTableTest.cpp index c17ea89ddfe28..3626ed9d864fa 100644 --- a/llvm/unittests/Support/SimpleTableTest.cpp +++ b/llvm/unittests/Support/SimpleTableTest.cpp @@ -46,6 +46,8 @@ TEST(SimpleTable, Operations) { auto ReplaceCodeWith = "a_0.spv\n" "a_1.spv\n"; + auto ReplaceSinglePropertyWith = "a_2.props"; + auto MemBuf = MemoryBuffer::getMemBuffer(Content); auto MemBufRepl = MemoryBuffer::getMemBuffer(ReplaceCodeWith); // Create tables from the strings above @@ -60,6 +62,11 @@ TEST(SimpleTable, Operations) { if (Error Err = Table->get()->replaceColumn("Code", *TableRepl->get(), "")) FAIL() << "SimpleTable::replaceColumn failed: " << Err << "\n"; + // -- Update cell + if (Error Err = Table->get()->updateCellValue("Properties", 1, + ReplaceSinglePropertyWith)) + FAIL() << "SimpleTable::updateCellValue failed: " << Err << "\n"; + // -- Add SmallVector NewCol = {"a_0.mnf", "a_1.mnf"}; if (Error Err = Table->get()->addColumn("Manifest", NewCol)) @@ -78,7 +85,7 @@ TEST(SimpleTable, Operations) { } auto Expected = "[Code|Properties|Manifest]\n" "a_0.spv|a_0.props|a_0.mnf\n" - "a_1.spv|a_1.props|a_1.mnf\n"; + "a_1.spv|a_2.props|a_1.mnf\n"; ASSERT_EQ(Result, Expected); } diff --git a/sycl/doc/CompilerAndRuntimeDesign.md b/sycl/doc/CompilerAndRuntimeDesign.md index 44874cfbde60f..e98233eb73951 100644 --- a/sycl/doc/CompilerAndRuntimeDesign.md +++ b/sycl/doc/CompilerAndRuntimeDesign.md @@ -465,9 +465,15 @@ factors. Each edge is also annotated with the input/output file type. The diagram does not show the `llvm-foreach` tool invocations for clarity. This tool invokes given command line over each file in a file list. In this diagram the tool is applied to `llvm-spirv` and AOT backend whenever the -input/output type is `TY_tempfilelist`. The second invocation of the -`file-table-tform` takes two inputs - the file table and a file list coming -either from `llvm-spirv` or from the AOT backend. +input/output type is `TY_tempfilelist` and the target is not PTX. +Following this, `file-table-tform` takes two inputs - the file table and a file +list coming either from `llvm-spirv` or from the AOT backend. +Targeting PTX currently only accepts a single input file for processing, so +`file-table-tform` is used to extract the code file from the file table, which +is then processed by the +["PTX target processing" step](#device-code-post-link-step-for-CUDA). +The resulting device binary is inserted back into the file table in place of the +extracted code file using `file-table-tform`. ##### Device code splitting @@ -547,7 +553,8 @@ objects for the CUDA target are linked together alongside using the NVPTX backend and assembled into a cubin using the `ptxas` tool (part of the CUDA SDK). The PTX file and cubin are assembled together using `fatbinary` to produce a CUDA fatbin. The CUDA fatbin -is then passed to the offload wrapper tool. +then replaces the llvm bitcode file in the file table generated by +`sycl-post-link`. The resulting table is passed to the offload wrapper tool. ![NVPTX AOT build](images/DevicePTXProcessing.svg) diff --git a/sycl/doc/images/DeviceLinkAndWrap.svg b/sycl/doc/images/DeviceLinkAndWrap.svg index 99ffe673132b5..10ea7704f79d8 100644 --- a/sycl/doc/images/DeviceLinkAndWrap.svg +++ b/sycl/doc/images/DeviceLinkAndWrap.svg @@ -9,7 +9,7 @@ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" sodipodi:docname="DeviceLinkAndWrap.svg" - inkscape:version="1.0.2 (e86c870879, 2021-01-15, custom)" + inkscape:version="1.0.2-2 (e86c870879, 2021-01-15)" id="svg8" version="1.1" viewBox="0 0 205.79753 221.03191" @@ -1494,6 +1494,339 @@ id="path2339" inkscape:connector-curvature="0" /> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + id="base" + inkscape:snap-bbox="false" + inkscape:snap-grids="true" + inkscape:snap-global="false" /> @@ -1539,6 +1875,21 @@ id="g841" inkscape:label="pics_device_link_graph" transform="matrix(0.35277777,0,0,-0.35277777,-16.630949,187.55186)"> + + + @@ -1742,7 +2093,7 @@ @@ -1762,17 +2113,17 @@ id="path999" inkscape:connector-curvature="0" /> @@ -1782,37 +2133,32 @@ id="path1007" inkscape:connector-curvature="0" /> - @@ -1827,72 +2173,109 @@ id="path1025" inkscape:connector-curvature="0" /> - - + + + + + + + + + - + + + + + + + - - - - - - - - - - - - - - + transform="translate(-4.1152937,19.101767)" + id="g1355-2"> Split code + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + file + + + + + - + + + + + table + + + + + - + + + + + tform + + + copy “Code” + + + + All targets + + diff --git a/sycl/doc/images/DevicePTXProcessing.svg b/sycl/doc/images/DevicePTXProcessing.svg index 3e53d9e95c3d4..a639d2906a6c8 100644 --- a/sycl/doc/images/DevicePTXProcessing.svg +++ b/sycl/doc/images/DevicePTXProcessing.svg @@ -9,7 +9,7 @@ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" sodipodi:docname="DevicePTXProcessing.svg" - inkscape:version="0.92.3 (2405546, 2018-03-11)" + inkscape:version="1.0.2-2 (e86c870879, 2021-01-15)" id="svg8" version="1.1" viewBox="0 0 205.79753 221.03191" @@ -31,21 +31,6 @@ d="M 5.77,0.0 L -2.88,5.0 L -2.88,-5.0 L 5.77,0.0 z " id="path4942" /> - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + transform="matrix(0.35277777,0,0,-0.35277777,-47.188892,194.96288)"> @@ -2049,7 +2287,7 @@ + transform="matrix(0.35277777,0,0,-0.35277777,-47.188892,194.96288)"> @@ -2071,73 +2309,71 @@ + style="fill:#4472c4;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:0.352778" + d="M 90.140434,11.109217 H 126.9281 v 8.078611 l -1.61572,1.615722 H 90.140434 Z" /> + style="fill:#375c9e;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:0.352778" + d="m 125.31238,20.80355 0.32455,-1.291166 1.29117,-0.324556 z" /> - Device LLVM IR - + style="stroke-width:0.352778" + x="99.965302" + y="15.401005">File table + style="fill:#4472c4;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778" + d="m 108.97877,5.5423837 v 3.5489439 h -0.80434 V 5.5423837 Z m 0.80433,3.1467782 -1.2065,2.4130001 -1.2065,-2.4130001 z" /> + style="fill:#4472c4;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778" + d="m 108.97877,20.824717 v 4.780139 h -0.80434 v -4.780139 z m 0.80433,4.377972 -1.2065,2.413 -1.2065,-2.413 z" /> + style="fill:#428374;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:0.352778" + d="m 65.824475,64.217895 c 0,-0.807861 0.814917,-1.4605 1.820334,-1.4605 1.005416,0 1.820333,0.652639 1.820333,1.4605 0,0.807861 -0.814917,1.4605 -1.820333,1.4605 -1.005417,0 -1.820334,-0.652639 -1.820334,-1.4605 z" /> + style="fill:#428374;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:0.352778" + d="m 97.99781,64.260228 c 0,-0.807861 0.81491,-1.4605 1.82033,-1.4605 1.00542,0 1.82033,0.652639 1.82033,1.4605 0,0.807861 -0.81491,1.4605 -1.82033,1.4605 -1.00542,0 -1.82033,-0.652639 -1.82033,-1.4605 z" /> + style="fill:#428374;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:0.352778" + d="m 129.91714,64.217895 c 0,-0.807861 0.81492,-1.4605 1.82033,-1.4605 1.00542,0 1.82034,0.652639 1.82034,1.4605 0,0.807861 -0.81492,1.4605 -1.82034,1.4605 -1.00541,0 -1.82033,-0.652639 -1.82033,-1.4605 z" /> + style="fill:#428374;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:0.352778" + d="m 158.74614,64.514228 c 0,-0.807861 0.81492,-1.4605 1.82033,-1.4605 1.00542,0 1.82034,0.652639 1.82034,1.4605 0,0.807861 -0.81492,1.4605 -1.82034,1.4605 -1.00541,0 -1.82033,-0.652639 -1.82033,-1.4605 z" /> + style="fill:#428374;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:0.352778" + d="m 98.607104,182.9885 c 0,-0.80786 0.81492,-1.4605 1.820326,-1.4605 1.00542,0 1.82034,0.65264 1.82034,1.4605 0,0.80786 -0.81492,1.4605 -1.82034,1.4605 -1.005406,0 -1.820326,-0.65264 -1.820326,-1.4605 z" /> + style="fill:#428374;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:0.352778" + d="m 139.92443,182.81917 c 0,-0.80786 0.81492,-1.4605 1.82034,-1.4605 1.00541,0 1.82033,0.65264 1.82033,1.4605 0,0.80786 -0.81492,1.4605 -1.82033,1.4605 -1.00542,0 -1.82034,-0.65264 -1.82034,-1.4605 z" /> + style="fill:#428374;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:0.352778" + d="m 66.391439,183.03083 c 0,-0.80786 0.814917,-1.4605 1.820334,-1.4605 1.005416,0 1.820333,0.65264 1.820333,1.4605 0,0.80786 -0.814917,1.4605 -1.820333,1.4605 -1.005417,0 -1.820334,-0.65264 -1.820334,-1.4605 z" /> + transform="matrix(0.35277777,0,0,-0.35277777,-8.162362,195.32856)"> @@ -2158,7 +2394,7 @@ + transform="matrix(0.35277777,0,0,-0.35277777,-7.104029,195.09572)"> @@ -2179,7 +2415,7 @@ + transform="matrix(0.35277777,0,0,-0.35277777,-7.104029,195.09572)"> - Clang - + x="101.31163" + y="84.249756">Clang + transform="matrix(0.35277777,0,0,-0.35277777,-47.188892,221.19433)"> @@ -2220,7 +2454,7 @@ + transform="matrix(0.35277777,0,0,-0.35277777,-47.188892,221.19433)"> @@ -2241,7 +2475,7 @@ + transform="matrix(0.35277777,0,0,-0.35277777,-47.188892,221.19433)"> + transform="matrix(0.35277777,0,0,-0.35277777,-47.188892,221.19433)"> - clang - + x="0 7.7220001 15.444 23.166 30.94416">clang + transform="matrix(0.35277777,0,0,-0.35277777,-47.188892,221.19433)"> - - - + x="0">- + transform="matrix(0.35277777,0,0,-0.35277777,-47.188892,221.19433)"> - offload - + x="0 7.7922001 15.5142 23.2362 30.9582 38.736359 46.528561">offload + transform="matrix(0.35277777,0,0,-0.35277777,-47.188892,221.19433)"> - - - + x="0">- + transform="matrix(0.35277777,0,0,-0.35277777,-47.188892,221.19433)"> - wrapper - + x="0 7.7922001 15.5142 23.2362 30.9582 38.736359 46.458359">wrapper + transform="matrix(0.35277777,0,0,-0.35277777,-47.755856,220.70306)"> @@ -2359,7 +2583,7 @@ + transform="matrix(0.35277777,0,0,-0.35277777,-47.755856,220.70306)"> @@ -2380,7 +2604,7 @@ + transform="matrix(0.35277777,0,0,-0.35277777,-47.755856,220.70306)"> - PTX target processing - + x="81.597427" + y="60.97049">PTX target processing + transform="matrix(0.35277777,0,0,-0.35277777,-47.188892,221.19433)"> @@ -2421,7 +2643,7 @@ + transform="matrix(0.35277777,0,0,-0.35277777,-47.188892,221.19433)"> @@ -2443,34 +2665,32 @@ + style="fill:#ffc000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:0.352778" + d="M 90.140434,199.98533 H 126.9281 v 12.41778 l -2.48355,2.48356 H 90.140434 Z" /> + style="fill:#cd9a00;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:0.352778" + d="m 124.44455,214.88667 0.49741,-1.98685 1.98614,-0.49671 z" /> + transform="matrix(0.35277777,0,0,-0.35277777,-47.188892,221.19433)"> - Wrapper object - + x="0 12.11652 16.79184 23.517 30.831841 38.146679 45.138599 50.038559 53.141399 60.540482 67.911484 71.267036 78.258957 84.099602">Wrapper object + transform="matrix(0.35277777,0,0,-0.35277777,-47.188892,221.19433)"> @@ -2491,7 +2711,7 @@ + transform="matrix(0.35277777,0,0,-0.35277777,-47.188892,221.19433)"> @@ -2513,104 +2733,97 @@ + style="fill:#4472c4;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:0.352778" + d="M 93.315434,206.88567 H 123.7531 v 6.6675 l -1.3335,1.3335 H 93.315434 Z" /> + style="fill:#375c9e;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:0.352778" + d="m 122.4196,214.88667 0.26811,-1.0668 1.06539,-0.2667 z" /> + transform="matrix(0.35277777,0,0,-0.35277777,-47.188892,221.19433)"> - Device code - + x="0 8.6493597 15.484464 21.841393 25.076113 30.968927 37.930607 41.151264 46.917503 54.329231 61.712833">Device code + style="fill:#4472c4;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778" + d="m 108.97877,194.20683 v 3.78954 h -0.80434 v -3.78954 z m 0.80433,3.38738 -1.2065,2.413 -1.2065,-2.413 z" /> + style="fill:#4472c4;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.352778" + d="m 108.97877,214.90783 v 3.21116 h -0.80434 v -3.21116 z m 0.80433,2.809 -1.2065,2.413 -1.2065,-2.413 z" /> - (from sycl-post-link) - + x="28.692085" + y="17.476978">(from sycl-post-link) + id="g1668" + transform="translate(-29.00824,13.042864)"> - libspirv.bc - + sodipodi:role="line">libspirv.bc + transform="translate(-29.00824,11.98453)"> - libdevice.bc - + sodipodi:role="line">libdevice.bc + transform="matrix(0.35277777,0,0,-0.35277777,-7.4064096,222.43894)"> @@ -2631,7 +2844,7 @@ + transform="matrix(0.35277777,0,0,-0.35277777,-6.3480766,222.2061)"> @@ -2652,7 +2865,7 @@ + transform="matrix(0.35277777,0,0,-0.35277777,-6.3480766,222.2061)"> - ptxas - + x="102.06758" + y="113.47669">ptxas + transform="matrix(0.35277777,0,0,-0.35277777,-7.4064096,246.33646)"> @@ -2693,7 +2904,7 @@ + transform="matrix(0.35277777,0,0,-0.35277777,-6.3480766,246.10362)"> @@ -2714,7 +2925,7 @@ + transform="matrix(0.35277777,0,0,-0.35277777,-6.3480766,246.10362)"> - fatbin - + x="102.06758" + y="137.37424">fatbin - - ptx - + id="tspan3794">ptx - cubin - + id="tspan3916">cubin - - LLVM IR - + x="110.90725" + y="73.327454">LLVM IR - CUDA fatbin - + id="tspan2303">CUDA fatbin - (to host linker) - + x="31.587437" + y="216.68318">(to host linker) - (nvptx backend) - + x="91.899139" + y="88.973877">(nvptx backend) - (linked) - + x="97.467636" + y="19.142481">(Single row) + + + + + + + + + + + + + + + + + + + + + + file-table-tform + (Copy "Code") + LLVM IR + + + + + + + + + + + + + + + + + + + + + + file-table-tform + (Replace "Code") + File table + diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index 49c44946000f0..9a0e9470e7bda 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -689,9 +689,16 @@ static const uint8_t PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL = 4; #define __SYCL_PI_PROPERTY_SET_DEVICELIB_REQ_MASK "SYCL/devicelib req mask" /// PropertySetRegistry::SYCL_KERNEL_PARAM_OPT_INFO defined in PropertySetIO.h #define __SYCL_PI_PROPERTY_SET_KERNEL_PARAM_OPT_INFO "SYCL/kernel param opt" +/// PropertySetRegistry::SYCL_KERNEL_PROGRAM_METADATA defined in PropertySetIO.h +#define __SYCL_PI_PROPERTY_SET_PROGRAM_METADATA "SYCL/program metadata" /// PropertySetRegistry::SYCL_MISC_PROP defined in PropertySetIO.h #define __SYCL_PI_PROPERTY_SET_SYCL_MISC_PROP "SYCL/misc properties" +/// Program metadata tags recognized by the PI backends. For kernels the tag +/// must appear after the kernel name. +#define __SYCL_PI_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE \ + "@reqd_work_group_size" + /// This struct is a record of the device binary information. If the Kind field /// denotes a portable binary type (SPIR-V or LLVM IR), the DeviceTargetSpec /// field can still be specific and denote e.g. FPGA target. It must match the @@ -1108,9 +1115,26 @@ __SYCL_EXPORT pi_result piclProgramCreateWithSource(pi_context context, const size_t *lengths, pi_program *ret_program); +/// Creates a PI program for a context and loads the given binary into it. +/// +/// \param context is the PI context to associate the program with. +/// \param num_devices is the number of devices in device_list. +/// \param device_list is a pointer to a list of devices. These devices must all +/// be in context. +/// \param lengths is an array of sizes in bytes of the binary in binaries. +/// \param binaries is a pointer to a list of program binaries. +/// \param num_metadata_entries is the number of metadata entries in metadata. +/// \param metadata is a pointer to a list of program metadata entries. The +/// use of metadata entries is backend-defined. +/// \param binary_status returns whether the program binary was loaded +/// succesfully or not, for each device in device_list. +/// binary_status is ignored if it is null and otherwise +/// it must be an array of num_devices elements. +/// \param program is the PI program created from the program binaries. __SYCL_EXPORT pi_result piProgramCreateWithBinary( pi_context context, pi_uint32 num_devices, const pi_device *device_list, const size_t *lengths, const unsigned char **binaries, + size_t num_metadata_entries, const pi_device_binary_property *metadata, pi_int32 *binary_status, pi_program *ret_program); __SYCL_EXPORT pi_result piProgramGetInfo(pi_program program, diff --git a/sycl/include/CL/sycl/detail/pi.hpp b/sycl/include/CL/sycl/detail/pi.hpp index 4d621d2e33113..a0f553309b5cf 100644 --- a/sycl/include/CL/sycl/detail/pi.hpp +++ b/sycl/include/CL/sycl/detail/pi.hpp @@ -358,6 +358,7 @@ class DeviceBinaryImage { const PropertyRange &getKernelParamOptInfo() const { return KernelParamOptInfo; } + const PropertyRange &getProgramMetadata() const { return ProgramMetadata; } virtual ~DeviceBinaryImage() {} protected: @@ -369,6 +370,7 @@ class DeviceBinaryImage { DeviceBinaryImage::PropertyRange SpecConstIDMap; DeviceBinaryImage::PropertyRange DeviceLibReqMask; DeviceBinaryImage::PropertyRange KernelParamOptInfo; + DeviceBinaryImage::PropertyRange ProgramMetadata; }; /// Tries to determine the device binary image foramat. Returns diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index ba7e57ee7fbf3..8e49494d13f83 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -475,13 +475,53 @@ pi_result enqueueEventWait(pi_queue queue, pi_event event) { } _pi_program::_pi_program(pi_context ctxt) - : module_{nullptr}, binary_{}, - binarySizeInBytes_{0}, refCount_{1}, context_{ctxt} { + : module_{nullptr}, binary_{}, binarySizeInBytes_{0}, refCount_{1}, + context_{ctxt}, kernelReqdWorkGroupSizeMD_{} { cuda_piContextRetain(context_); } _pi_program::~_pi_program() { cuda_piContextRelease(context_); } +bool get_kernel_metadata(std::string metadataName, const char *tag, + std::string &kernelName) { + const size_t tagLength = strlen(tag); + const size_t metadataNameLength = metadataName.length(); + if (metadataNameLength >= tagLength && + metadataName.compare(metadataNameLength - tagLength, tagLength, tag) == + 0) { + kernelName = metadataName.substr(0, metadataNameLength - tagLength); + return true; + } + return false; +} + +pi_result _pi_program::set_metadata(const pi_device_binary_property *metadata, + size_t length) { + for (size_t i = 0; i < length; ++i) { + const pi_device_binary_property metadataElement = metadata[i]; + std::string metadataElementName{metadataElement->Name}; + std::string kernelName; + + // If metadata is reqd_work_group_size record it for the corresponding + // kernel name. + if (get_kernel_metadata(metadataElementName, + __SYCL_PI_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE, + kernelName)) { + assert(metadataElement->ValSize == + sizeof(std::uint64_t) + sizeof(std::uint32_t) * 3 && + "Unexpected size for reqd_work_group_size metadata"); + + // Get pointer to data, skipping 64-bit size at the start of the data. + const auto *reqdWorkGroupElements = + reinterpret_cast(metadataElement->ValAddr) + 2; + kernelReqdWorkGroupSizeMD_[kernelName] = + std::make_tuple(reqdWorkGroupElements[0], reqdWorkGroupElements[1], + reqdWorkGroupElements[2]); + } + } + return PI_SUCCESS; +} + pi_result _pi_program::set_binary(const char *source, size_t length) { assert((binary_ == nullptr && binarySizeInBytes_ == 0) && "Re-setting program binary data which has already been set"); @@ -530,6 +570,8 @@ pi_result _pi_program::build_program(const char *build_options) { /// Note: This is currently only being used by the SYCL program class for the /// has_kernel method, so an alternative would be to move the has_kernel /// query to PI and use cuModuleGetFunction to check for a kernel. +/// Note: Another alternative is to add kernel names as metadata, like with +/// reqd_work_group_size. std::string getKernelNames(pi_program program) { std::string source(program->binary_, program->binary_ + program->binarySizeInBytes_); @@ -2394,6 +2436,74 @@ pi_result cuda_piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index, return retErr; } +pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, + pi_kernel_group_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + + // Here we want to query about a kernel's cuda blocks! + + if (kernel != nullptr) { + + switch (param_name) { + case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { + int max_threads = 0; + cl::sycl::detail::pi::assertion( + cuFuncGetAttribute(&max_threads, + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + kernel->get()) == CUDA_SUCCESS); + return getInfo(param_value_size, param_value, param_value_size_ret, + size_t(max_threads)); + } + case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { + size_t group_size[3] = {0, 0, 0}; + const auto &reqd_wg_size_md_map = + kernel->program_->kernelReqdWorkGroupSizeMD_; + const auto reqd_wg_size_md = reqd_wg_size_md_map.find(kernel->name_); + if (reqd_wg_size_md != reqd_wg_size_md_map.end()) { + const auto reqd_wg_size = reqd_wg_size_md->second; + group_size[0] = std::get<0>(reqd_wg_size); + group_size[1] = std::get<1>(reqd_wg_size); + group_size[2] = std::get<2>(reqd_wg_size); + } + return getInfoArray(3, param_value_size, param_value, + param_value_size_ret, group_size); + } + case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { + // OpenCL LOCAL == CUDA SHARED + int bytes = 0; + cl::sycl::detail::pi::assertion( + cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, + kernel->get()) == CUDA_SUCCESS); + return getInfo(param_value_size, param_value, param_value_size_ret, + pi_uint64(bytes)); + } + case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { + // Work groups should be multiples of the warp size + int warpSize = 0; + cl::sycl::detail::pi::assertion( + cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + device->get()) == CUDA_SUCCESS); + return getInfo(param_value_size, param_value, param_value_size_ret, + static_cast(warpSize)); + } + case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { + // OpenCL PRIVATE == CUDA LOCAL + int bytes = 0; + cl::sycl::detail::pi::assertion( + cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, + kernel->get()) == CUDA_SUCCESS); + return getInfo(param_value_size, param_value, param_value_size_ret, + pi_uint64(bytes)); + } + default: + __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); + } + } + + return PI_INVALID_KERNEL; +} + pi_result cuda_piEnqueueKernelLaunch( pi_queue command_queue, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, @@ -2413,10 +2523,17 @@ pi_result cuda_piEnqueueKernelLaunch( int threadsPerBlock[3] = {32, 1, 1}; size_t maxWorkGroupSize = 0u; size_t maxThreadsPerBlock[3] = {}; + size_t reqdThreadsPerBlock[3] = {}; bool providedLocalWorkGroupSize = (local_work_size != nullptr); { - pi_result retError = cuda_piDeviceGetInfo( + pi_result retError = cuda_piKernelGetGroupInfo( + kernel, command_queue->device_, + PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, + sizeof(reqdThreadsPerBlock), reqdThreadsPerBlock, nullptr); + assert(retError == PI_SUCCESS); + + retError = cuda_piDeviceGetInfo( command_queue->device_, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES, sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr); assert(retError == PI_SUCCESS); @@ -2429,6 +2546,10 @@ pi_result cuda_piEnqueueKernelLaunch( if (providedLocalWorkGroupSize) { auto isValid = [&](int dim) { + if (reqdThreadsPerBlock[dim] != 0 && + local_work_size[dim] != reqdThreadsPerBlock[dim]) + return PI_INVALID_WORK_GROUP_SIZE; + if (local_work_size[dim] > maxThreadsPerBlock[dim]) return PI_INVALID_WORK_ITEM_SIZE; // Checks that local work sizes are a divisor of the global work sizes @@ -2756,6 +2877,7 @@ pi_result cuda_piProgramCreate(pi_context, const void *, size_t, pi_program *) { pi_result cuda_piProgramCreateWithBinary( pi_context context, pi_uint32 num_devices, const pi_device *device_list, const size_t *lengths, const unsigned char **binaries, + size_t num_metadata_entries, const pi_device_binary_property *metadata, pi_int32 *binary_status, pi_program *program) { // Ignore unused parameter (void)binary_status; @@ -2773,6 +2895,8 @@ pi_result cuda_piProgramCreateWithBinary( std::unique_ptr<_pi_program> retProgram{new _pi_program{context}}; + retProgram->set_metadata(metadata, num_metadata_entries); + const bool has_length = (lengths != nullptr); size_t length = has_length ? lengths[0] @@ -3050,71 +3174,6 @@ pi_result cuda_piKernelGetInfo(pi_kernel kernel, pi_kernel_info param_name, return PI_INVALID_KERNEL; } -pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, - pi_kernel_group_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - - // here we want to query about a kernel's cuda blocks! - - if (kernel != nullptr) { - - switch (param_name) { - case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { - int max_threads = 0; - cl::sycl::detail::pi::assertion( - cuFuncGetAttribute(&max_threads, - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - kernel->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - size_t(max_threads)); - } - case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { - // Returns the work-group size specified in the kernel source or IL. - // If the work-group size is not specified in the kernel source or IL, - // (0, 0, 0) is returned. - // https://www.khronos.org/registry/OpenCL/sdk/2.1/docs/man/xhtml/clGetKernelWorkGroupInfo.html - - // TODO: can we extract the work group size from the PTX? - size_t group_size[3] = {0, 0, 0}; - return getInfoArray(3, param_value_size, param_value, - param_value_size_ret, group_size); - } - case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { - // OpenCL LOCAL == CUDA SHARED - int bytes = 0; - cl::sycl::detail::pi::assertion( - cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, - kernel->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(bytes)); - } - case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { - // Work groups should be multiples of the warp size - int warpSize = 0; - cl::sycl::detail::pi::assertion( - cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - device->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(warpSize)); - } - case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { - // OpenCL PRIVATE == CUDA LOCAL - int bytes = 0; - cl::sycl::detail::pi::assertion( - cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, - kernel->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(bytes)); - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - } - - return PI_INVALID_KERNEL; -} - pi_result cuda_piKernelGetSubGroupInfo( pi_kernel kernel, pi_device device, pi_kernel_sub_group_info param_name, size_t input_value_size, const void *input_value, size_t param_value_size, diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index 0804da17d2281..1ae3f9c464ada 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -504,6 +504,10 @@ struct _pi_program { std::atomic_uint32_t refCount_; _pi_context *context_; + // Metadata + std::unordered_map> + kernelReqdWorkGroupSizeMD_; + constexpr static size_t MAX_LOG_SIZE = 8192u; char errorLog_[MAX_LOG_SIZE], infoLog_[MAX_LOG_SIZE]; @@ -513,6 +517,9 @@ struct _pi_program { _pi_program(pi_context ctxt); ~_pi_program(); + pi_result set_metadata(const pi_device_binary_property *metadata, + size_t length); + pi_result set_binary(const char *binary, size_t binarySizeInBytes); pi_result build_program(const char* build_options); diff --git a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp index 63fc720f49eee..79961f17fc6a1 100644 --- a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp +++ b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp @@ -740,6 +740,7 @@ pi_result piProgramCreate(pi_context, const void *, size_t, pi_program *) { pi_result piProgramCreateWithBinary(pi_context, pi_uint32, const pi_device *, const size_t *, const unsigned char **, + size_t, const pi_device_binary_property *, pi_int32 *, pi_program *) { DIE_NO_IMPLEMENTATION; return PI_SUCCESS; diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index e0c9c497477d2..c1fd48e9015b4 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -2936,12 +2936,13 @@ pi_result piProgramCreate(pi_context Context, const void *ILBytes, return PI_SUCCESS; } -pi_result piProgramCreateWithBinary(pi_context Context, pi_uint32 NumDevices, - const pi_device *DeviceList, - const size_t *Lengths, - const unsigned char **Binaries, - pi_int32 *BinaryStatus, - pi_program *Program) { +pi_result piProgramCreateWithBinary( + pi_context Context, pi_uint32 NumDevices, const pi_device *DeviceList, + const size_t *Lengths, const unsigned char **Binaries, + size_t NumMetadataEntries, const pi_device_binary_property *Metadata, + pi_int32 *BinaryStatus, pi_program *Program) { + (void)Metadata; + (void)NumMetadataEntries; PI_ASSERT(Context, PI_INVALID_CONTEXT); PI_ASSERT(DeviceList && NumDevices, PI_INVALID_VALUE); diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 1d762aaa5fafb..82124b08f0c07 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -635,12 +635,13 @@ pi_result piclProgramCreateWithSource(pi_context context, pi_uint32 count, return ret_err; } -pi_result piProgramCreateWithBinary(pi_context context, pi_uint32 num_devices, - const pi_device *device_list, - const size_t *lengths, - const unsigned char **binaries, - pi_int32 *binary_status, - pi_program *ret_program) { +pi_result piProgramCreateWithBinary( + pi_context context, pi_uint32 num_devices, const pi_device *device_list, + const size_t *lengths, const unsigned char **binaries, + size_t num_metadata_entries, const pi_device_binary_property *metadata, + pi_int32 *binary_status, pi_program *ret_program) { + (void)metadata; + (void)num_metadata_entries; pi_result ret_err = PI_INVALID_OPERATION; *ret_program = cast(clCreateProgramWithBinary( diff --git a/sycl/plugins/rocm/pi_rocm.cpp b/sycl/plugins/rocm/pi_rocm.cpp index 4f7b5a83523aa..026c1fa8d5f10 100644 --- a/sycl/plugins/rocm/pi_rocm.cpp +++ b/sycl/plugins/rocm/pi_rocm.cpp @@ -2742,6 +2742,7 @@ pi_result rocm_piProgramCreate(pi_context context, const void *il, pi_result rocm_piProgramCreateWithBinary( pi_context context, pi_uint32 num_devices, const pi_device *device_list, const size_t *lengths, const unsigned char **binaries, + size_t num_metadata_entries, const pi_device_binary_property *metadata, pi_int32 *binary_status, pi_program *program) { assert(context != nullptr); assert(binaries != nullptr); @@ -2756,6 +2757,9 @@ pi_result rocm_piProgramCreateWithBinary( std::unique_ptr<_pi_program> retProgram{new _pi_program{context}}; + // TODO: Set metadata here and use reqd_work_group_size information. + // See cuda_piProgramCreateWithBinary + const bool has_length = (lengths != nullptr); size_t length = has_length ? lengths[0] diff --git a/sycl/source/detail/pi.cpp b/sycl/source/detail/pi.cpp index aa77a7e819d00..e460146fc3644 100644 --- a/sycl/source/detail/pi.cpp +++ b/sycl/source/detail/pi.cpp @@ -690,6 +690,7 @@ void DeviceBinaryImage::init(pi_device_binary Bin) { SpecConstIDMap.init(Bin, __SYCL_PI_PROPERTY_SET_SPEC_CONST_MAP); DeviceLibReqMask.init(Bin, __SYCL_PI_PROPERTY_SET_DEVICELIB_REQ_MASK); KernelParamOptInfo.init(Bin, __SYCL_PI_PROPERTY_SET_KERNEL_PARAM_OPT_INFO); + ProgramMetadata.init(Bin, __SYCL_PI_PROPERTY_SET_PROGRAM_METADATA); } } // namespace pi diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index 298ae7e4e11d0..548fc80368cd7 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -64,10 +64,10 @@ ProgramManager &ProgramManager::getInstance() { return GlobalHandler::instance().getProgramManager(); } -static RT::PiProgram createBinaryProgram(const ContextImplPtr Context, - const device &Device, - const unsigned char *Data, - size_t DataLen) { +static RT::PiProgram +createBinaryProgram(const ContextImplPtr Context, const device &Device, + const unsigned char *Data, size_t DataLen, + const std::vector Metadata) { const detail::plugin &Plugin = Context->getPlugin(); #ifndef _NDEBUG pi_uint32 NumDevices = 0; @@ -84,7 +84,7 @@ static RT::PiProgram createBinaryProgram(const ContextImplPtr Context, pi_int32 BinaryStatus = CL_SUCCESS; Plugin.call( Context->getHandleRef(), 1 /*one binary*/, &PiDevice, &DataLen, &Data, - &BinaryStatus, &Program); + Metadata.size(), Metadata.data(), &BinaryStatus, &Program); if (BinaryStatus != CL_SUCCESS) { throw runtime_error("Creating program with binary failed.", BinaryStatus); @@ -339,12 +339,17 @@ RT::PiProgram ProgramManager::createPIProgram(const RTDeviceBinaryImage &Img, "SPIR-V online compilation is not supported in this context", PI_INVALID_OPERATION); + // Get program metadata from properties + auto ProgMetadata = Img.getProgramMetadata(); + std::vector ProgMetadataVector{ + ProgMetadata.begin(), ProgMetadata.end()}; + // Load the image const ContextImplPtr Ctx = getSyclObjImpl(Context); - RT::PiProgram Res = - Format == PI_DEVICE_BINARY_TYPE_SPIRV - ? createSpirvProgram(Ctx, RawImg.BinaryStart, ImgSize) - : createBinaryProgram(Ctx, Device, RawImg.BinaryStart, ImgSize); + RT::PiProgram Res = Format == PI_DEVICE_BINARY_TYPE_SPIRV + ? createSpirvProgram(Ctx, RawImg.BinaryStart, ImgSize) + : createBinaryProgram(Ctx, Device, RawImg.BinaryStart, + ImgSize, ProgMetadataVector); { std::lock_guard Lock(MNativeProgramsMutex); @@ -453,13 +458,18 @@ RT::PiProgram ProgramManager::getBuiltPIProgram(OSModuleHandle M, const detail::plugin &Plugin = ContextImpl->getPlugin(); RT::PiProgram NativePrg; + // Get program metadata from properties + auto ProgMetadata = Img.getProgramMetadata(); + std::vector ProgMetadataVector{ + ProgMetadata.begin(), ProgMetadata.end()}; + auto BinProg = PersistentDeviceCodeCache::getItemFromDisc( Device, Img, SpecConsts, CompileOpts + LinkOpts); if (BinProg.size()) { // TODO: Build for multiple devices once supported by program manager NativePrg = createBinaryProgram(ContextImpl, Device, (const unsigned char *)BinProg[0].data(), - BinProg[0].size()); + BinProg[0].size(), ProgMetadataVector); } else { NativePrg = createPIProgram(Img, Context, Device); if (Prg) diff --git a/sycl/unittests/kernel-and-program/Cache.cpp b/sycl/unittests/kernel-and-program/Cache.cpp index 58d2a568a51c6..946f54f5de596 100644 --- a/sycl/unittests/kernel-and-program/Cache.cpp +++ b/sycl/unittests/kernel-and-program/Cache.cpp @@ -102,6 +102,7 @@ static pi_result redefinedProgramCreateWithSource(pi_context context, static pi_result redefinedProgramCreateWithBinary( pi_context context, pi_uint32 num_devices, const pi_device *device_list, const size_t *lengths, const unsigned char **binaries, + size_t metadata_length, const pi_device_binary_property *metadata, pi_int32 *binary_status, pi_program *ret_program) { *ret_program = reinterpret_cast(1); return PI_SUCCESS; diff --git a/sycl/unittests/misc/KernelBuildOptions.cpp b/sycl/unittests/misc/KernelBuildOptions.cpp index eeaf9b827226e..91339553acd97 100644 --- a/sycl/unittests/misc/KernelBuildOptions.cpp +++ b/sycl/unittests/misc/KernelBuildOptions.cpp @@ -142,6 +142,7 @@ static pi_result redefinedProgramCreateWithSource(pi_context context, static pi_result redefinedProgramCreateWithBinary( pi_context context, pi_uint32 num_devices, const pi_device *device_list, const size_t *lengths, const unsigned char **binaries, + size_t num_metadata_entries, const pi_device_binary_property *metadata, pi_int32 *binary_status, pi_program *ret_program) { return PI_SUCCESS; } diff --git a/sycl/unittests/pi/cuda/test_kernels.cpp b/sycl/unittests/pi/cuda/test_kernels.cpp index d43cd6a7cd8bd..c4bdc3230de48 100644 --- a/sycl/unittests/pi/cuda/test_kernels.cpp +++ b/sycl/unittests/pi/cuda/test_kernels.cpp @@ -139,11 +139,57 @@ TEST_F(CudaKernelsTest, PICreateProgramAndKernel) { pi_program prog; pi_int32 binary_status = PI_SUCCESS; - ASSERT_EQ((plugin->call_nocheck( - context_, 1, &device_, nullptr, - (const unsigned char **)&ptxSource, &binary_status, &prog)), + ASSERT_EQ( + (plugin->call_nocheck( + context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource, 0, + nullptr, &binary_status, &prog)), + PI_SUCCESS); + + ASSERT_EQ((plugin->call_nocheck( + prog, 1, &device_, "", nullptr, nullptr)), PI_SUCCESS); + pi_kernel kern; + ASSERT_EQ((plugin->call_nocheck( + prog, "_Z8myKernelPi", &kern)), + PI_SUCCESS); + ASSERT_NE(kern, nullptr); +} + +TEST_F(CudaKernelsTest, PICreateProgramAndKernelWithMetadata) { + + std::vector reqdWorkGroupSizeMD; + reqdWorkGroupSizeMD.reserve(5); + // 64-bit representing bit size + reqdWorkGroupSizeMD.push_back(96); + reqdWorkGroupSizeMD.push_back(0); + // reqd_work_group_size x + reqdWorkGroupSizeMD.push_back(8); + // reqd_work_group_size y + reqdWorkGroupSizeMD.push_back(16); + // reqd_work_group_size z + reqdWorkGroupSizeMD.push_back(32); + + const char *reqdWorkGroupSizeMDConstName = + "_Z8myKernelPi@reqd_work_group_size"; + std::vector reqdWorkGroupSizeMDName( + reqdWorkGroupSizeMDConstName, + reqdWorkGroupSizeMDConstName + strlen(reqdWorkGroupSizeMDConstName) + 1); + _pi_device_binary_property_struct reqdWorkGroupSizeMDProp = { + reqdWorkGroupSizeMDName.data(), reqdWorkGroupSizeMD.data(), + pi_property_type::PI_PROPERTY_TYPE_BYTE_ARRAY, + sizeof(std::uint64_t) + sizeof(std::uint32_t) * 3}; + pi_device_binary_property reqdWorkGroupSizeMDPropPointer = + &reqdWorkGroupSizeMDProp; + + pi_program prog; + pi_int32 binary_status = PI_SUCCESS; + ASSERT_EQ( + (plugin->call_nocheck( + context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource, 1, + &reqdWorkGroupSizeMDPropPointer, &binary_status, &prog)), + PI_SUCCESS); + ASSERT_EQ((plugin->call_nocheck( prog, 1, &device_, "", nullptr, nullptr)), PI_SUCCESS); @@ -153,6 +199,15 @@ TEST_F(CudaKernelsTest, PICreateProgramAndKernel) { prog, "_Z8myKernelPi", &kern)), PI_SUCCESS); ASSERT_NE(kern, nullptr); + + size_t compileWGSize[3] = {0}; + ASSERT_EQ((plugin->call_nocheck( + kern, device_, PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, + sizeof(size_t) * 3, compileWGSize, nullptr)), + PI_SUCCESS); + for (int i = 0; i < 3; ++i) { + ASSERT_EQ(compileWGSize[i], reqdWorkGroupSizeMD[i + 2]); + } } TEST_F(CudaKernelsTest, PIKernelArgumentSimple) { @@ -162,10 +217,11 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSimple) { /// use it at some point in the future, pass it anyway and check the result. /// Same goes for all the other tests in this file. pi_int32 binary_status = PI_SUCCESS; - ASSERT_EQ((plugin->call_nocheck( - context_, 1, &device_, nullptr, - (const unsigned char **)&ptxSource, &binary_status, &prog)), - PI_SUCCESS); + ASSERT_EQ( + (plugin->call_nocheck( + context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource, 0, + nullptr, &binary_status, &prog)), + PI_SUCCESS); ASSERT_EQ(binary_status, PI_SUCCESS); ASSERT_EQ((plugin->call_nocheck( @@ -191,10 +247,11 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSetTwice) { pi_program prog; pi_int32 binary_status = PI_SUCCESS; - ASSERT_EQ((plugin->call_nocheck( - context_, 1, &device_, nullptr, - (const unsigned char **)&ptxSource, &binary_status, &prog)), - PI_SUCCESS); + ASSERT_EQ( + (plugin->call_nocheck( + context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource, 0, + nullptr, &binary_status, &prog)), + PI_SUCCESS); ASSERT_EQ(binary_status, PI_SUCCESS); ASSERT_EQ((plugin->call_nocheck( @@ -229,10 +286,11 @@ TEST_F(CudaKernelsTest, PIKernelSetMemObj) { pi_program prog; pi_int32 binary_status = PI_SUCCESS; - ASSERT_EQ((plugin->call_nocheck( - context_, 1, &device_, nullptr, - (const unsigned char **)&ptxSource, &binary_status, &prog)), - PI_SUCCESS); + ASSERT_EQ( + (plugin->call_nocheck( + context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource, 0, + nullptr, &binary_status, &prog)), + PI_SUCCESS); ASSERT_EQ(binary_status, PI_SUCCESS); ASSERT_EQ((plugin->call_nocheck( @@ -264,10 +322,11 @@ TEST_F(CudaKernelsTest, PIkerneldispatch) { pi_program prog; pi_int32 binary_status = PI_SUCCESS; - ASSERT_EQ((plugin->call_nocheck( - context_, 1, &device_, nullptr, - (const unsigned char **)&ptxSource, &binary_status, &prog)), - PI_SUCCESS); + ASSERT_EQ( + (plugin->call_nocheck( + context_, 1, &device_, nullptr, (const unsigned char **)&ptxSource, 0, + nullptr, &binary_status, &prog)), + PI_SUCCESS); ASSERT_EQ(binary_status, PI_SUCCESS); ASSERT_EQ((plugin->call_nocheck( @@ -307,10 +366,11 @@ TEST_F(CudaKernelsTest, PIkerneldispatchTwo) { pi_program prog; pi_int32 binary_status = PI_SUCCESS; - ASSERT_EQ((plugin->call_nocheck( - context_, 1, &device_, nullptr, - (const unsigned char **)&twoParams, &binary_status, &prog)), - PI_SUCCESS); + ASSERT_EQ( + (plugin->call_nocheck( + context_, 1, &device_, nullptr, (const unsigned char **)&twoParams, 0, + nullptr, &binary_status, &prog)), + PI_SUCCESS); ASSERT_EQ(binary_status, PI_SUCCESS); ASSERT_EQ((plugin->call_nocheck( @@ -362,11 +422,11 @@ TEST_F(CudaKernelsTest, PIKernelArgumentSetTwiceOneLocal) { pi_program prog; pi_int32 binary_status = PI_SUCCESS; - ASSERT_EQ( - (plugin->call_nocheck( - context_, 1, &device_, nullptr, - (const unsigned char **)&threeParamsTwoLocal, &binary_status, &prog)), - PI_SUCCESS); + ASSERT_EQ((plugin->call_nocheck( + context_, 1, &device_, nullptr, + (const unsigned char **)&threeParamsTwoLocal, 0, nullptr, + &binary_status, &prog)), + PI_SUCCESS); ASSERT_EQ(binary_status, PI_SUCCESS); ASSERT_EQ((plugin->call_nocheck(