diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index 4a6253f4eec87..35b59aafe818c 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -177,6 +177,7 @@ CODEGENOPT(NoImplicitFloat , 1, 0) ///< Set when -mno-implicit-float is enable CODEGENOPT(NullPointerIsValid , 1, 0) ///< Assume Null pointer deference is defined. CODEGENOPT(OpenCLCorrectlyRoundedDivSqrt, 1, 0) ///< -cl-fp32-correctly-rounded-divide-sqrt CODEGENOPT(HIPCorrectlyRoundedDivSqrt, 1, 1) ///< -fno-hip-fp32-correctly-rounded-divide-sqrt +CODEGENOPT(SYCLFp32PrecSqrt, 1, 0) ///< -fsycl-fp32-prec-sqrt CODEGENOPT(UniqueInternalLinkageNames, 1, 0) ///< Internal Linkage symbols get unique names. CODEGENOPT(SplitMachineFunctions, 1, 0) ///< Split machine functions using profile information. diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 6ef29fbf833be..d0c5827eb9c2e 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4732,6 +4732,9 @@ def fno_sycl_device_lib_EQ : CommaJoined<["-"], "fno-sycl-device-lib=">, Group, HelpText<"Control exclusion of " "device libraries from device binary linkage. Valid arguments " "are libc, libm-fp32, libm-fp64, all">; +def fsycl_fp32_prec_sqrt : Flag<["-"], "fsycl-fp32-prec-sqrt">, Group, Flags<[CC1Option]>, + HelpText<"SYCL only. Specify that single precision floating-point sqrt is correctly rounded.">, + MarshallingInfoFlag>; //===----------------------------------------------------------------------===// // FLangOption + CoreOption + NoXarchOption diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h index abb986a42f221..cdd60b4eee871 100644 --- a/clang/include/clang/Driver/ToolChain.h +++ b/clang/include/clang/Driver/ToolChain.h @@ -705,7 +705,8 @@ class ToolChain { /// Get paths of HIP device libraries. virtual llvm::SmallVector - getHIPDeviceLibs(const llvm::opt::ArgList &Args) const; + getHIPDeviceLibs(const llvm::opt::ArgList &Args, + const Action::OffloadKind DeviceOffloadingKind) const; /// Return sanitizers which are available in this toolchain. virtual SanitizerMask getSupportedSanitizers() const; diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 5ce9f942efd27..106d87d3dddfe 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -1097,7 +1097,9 @@ void ToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const {} llvm::SmallVector -ToolChain::getHIPDeviceLibs(const ArgList &DriverArgs) const { +ToolChain::getHIPDeviceLibs( + const ArgList &DriverArgs, + const Action::OffloadKind DeviceOffloadingKind) const { return {}; } diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index 43ce33750ebac..96a7c23c1778b 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -895,9 +895,9 @@ bool AMDGPUToolChain::shouldSkipArgument(const llvm::opt::Arg *A) const { return false; } -llvm::SmallVector -ROCMToolChain::getCommonDeviceLibNames(const llvm::opt::ArgList &DriverArgs, - const std::string &GPUArch) const { +llvm::SmallVector ROCMToolChain::getCommonDeviceLibNames( + const llvm::opt::ArgList &DriverArgs, const std::string &GPUArch, + const Action::OffloadKind DeviceOffloadingKind) const { auto Kind = llvm::AMDGPU::parseArchAMDGCN(GPUArch); const StringRef CanonArch = llvm::AMDGPU::getArchNameAMDGCN(Kind); @@ -920,9 +920,15 @@ ROCMToolChain::getCommonDeviceLibNames(const llvm::opt::ArgList &DriverArgs, options::OPT_fno_unsafe_math_optimizations, false); bool FastRelaxedMath = DriverArgs.hasFlag(options::OPT_ffast_math, options::OPT_fno_fast_math, false); - bool CorrectSqrt = DriverArgs.hasFlag( - options::OPT_fhip_fp32_correctly_rounded_divide_sqrt, - options::OPT_fno_hip_fp32_correctly_rounded_divide_sqrt); + bool CorrectSqrt = false; + if (DeviceOffloadingKind == Action::OFK_SYCL) { + // When using SYCL, sqrt is only correctly rounded if the flag is specified + CorrectSqrt = DriverArgs.hasArg(options::OPT_fsycl_fp32_prec_sqrt); + } else + CorrectSqrt = DriverArgs.hasFlag( + options::OPT_fhip_fp32_correctly_rounded_divide_sqrt, + options::OPT_fno_hip_fp32_correctly_rounded_divide_sqrt); + bool Wave64 = isWave64(DriverArgs, Kind); return RocmInstallation.getCommonBitcodeLibs( diff --git a/clang/lib/Driver/ToolChains/AMDGPU.h b/clang/lib/Driver/ToolChains/AMDGPU.h index 156bfd1fbdb2a..c459f4629d343 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.h +++ b/clang/lib/Driver/ToolChains/AMDGPU.h @@ -142,7 +142,8 @@ class LLVM_LIBRARY_VISIBILITY ROCMToolChain : public AMDGPUToolChain { // Returns a list of device library names shared by different languages llvm::SmallVector getCommonDeviceLibNames(const llvm::opt::ArgList &DriverArgs, - const std::string &GPUArch) const; + const std::string &GPUArch, + const Action::OffloadKind DeviceOffloadingKind) const; }; } // end namespace toolchains diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp index f282f04b79311..07d2f60866260 100644 --- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp @@ -123,7 +123,8 @@ const char *AMDGCN::OpenMPLinker::constructLLVMLinkCommand( // - write an opt pass that sets that on every function it sees and pipe // the device-libs bitcode through that on the way to this llvm-link SmallVector BCLibs = - AMDGPUOpenMPTC.getCommonDeviceLibNames(Args, SubArchName.str()); + AMDGPUOpenMPTC.getCommonDeviceLibNames(Args, SubArchName.str(), + Action::OFK_OpenMP); llvm::for_each(BCLibs, [&](StringRef BCFile) { CmdArgs.push_back(Args.MakeArgString(BCFile)); }); diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index 0058def1aacfa..d1c3d575b598c 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -707,6 +707,10 @@ void CudaToolChain::addClangTargetOptions( if (DeviceOffloadingKind == Action::OFK_SYCL) { toolchains::SYCLToolChain::AddSYCLIncludeArgs(getDriver(), DriverArgs, CC1Args); + + if (DriverArgs.hasArg(options::OPT_fsycl_fp32_prec_sqrt)) { + CC1Args.push_back("-fcuda-prec-sqrt"); + } } auto NoLibSpirv = DriverArgs.hasArg(options::OPT_fno_sycl_libspirv, diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp index 6bf5b1a220be1..fff8d455568e6 100644 --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp @@ -256,11 +256,12 @@ void HIPAMDToolChain::addClangTargetOptions( CC1Args.push_back(DriverArgs.MakeArgString(LibSpirvFile)); } - llvm::for_each(getHIPDeviceLibs(DriverArgs), [&](auto BCFile) { - CC1Args.push_back(BCFile.ShouldInternalize ? "-mlink-builtin-bitcode" - : "-mlink-bitcode-file"); - CC1Args.push_back(DriverArgs.MakeArgString(BCFile.Path)); - }); + llvm::for_each( + getHIPDeviceLibs(DriverArgs, DeviceOffloadingKind), [&](auto BCFile) { + CC1Args.push_back(BCFile.ShouldInternalize ? "-mlink-builtin-bitcode" + : "-mlink-bitcode-file"); + CC1Args.push_back(DriverArgs.MakeArgString(BCFile.Path)); + }); } llvm::opt::DerivedArgList * @@ -355,7 +356,9 @@ VersionTuple HIPAMDToolChain::computeMSVCVersion(const Driver *D, } llvm::SmallVector -HIPAMDToolChain::getHIPDeviceLibs(const llvm::opt::ArgList &DriverArgs) const { +HIPAMDToolChain::getHIPDeviceLibs( + const llvm::opt::ArgList &DriverArgs, + const Action::OffloadKind DeviceOffloadingKind) const { llvm::SmallVector BCLibs; if (DriverArgs.hasArg(options::OPT_nogpulib)) return {}; @@ -412,7 +415,8 @@ HIPAMDToolChain::getHIPDeviceLibs(const llvm::opt::ArgList &DriverArgs) const { BCLibs.push_back(RocmInstallation.getHIPPath()); // Add common device libraries like ocml etc. - for (auto N : getCommonDeviceLibNames(DriverArgs, GpuArch.str())) + for (auto N : getCommonDeviceLibNames(DriverArgs, GpuArch.str(), + DeviceOffloadingKind)) BCLibs.push_back(StringRef(N)); // Add instrument lib. diff --git a/clang/lib/Driver/ToolChains/HIPAMD.h b/clang/lib/Driver/ToolChains/HIPAMD.h index e4a2f74796484..3b2c2383857a3 100644 --- a/clang/lib/Driver/ToolChains/HIPAMD.h +++ b/clang/lib/Driver/ToolChains/HIPAMD.h @@ -86,8 +86,9 @@ class LLVM_LIBRARY_VISIBILITY HIPAMDToolChain final : public ROCMToolChain { llvm::opt::ArgStringList &CC1Args) const override; void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; - llvm::SmallVector - getHIPDeviceLibs(const llvm::opt::ArgList &Args) const override; + llvm::SmallVector getHIPDeviceLibs( + const llvm::opt::ArgList &Args, + const Action::OffloadKind DeviceOffloadingKind) const override; SanitizerMask getSupportedSanitizers() const override; diff --git a/clang/lib/Driver/ToolChains/HIPSPV.cpp b/clang/lib/Driver/ToolChains/HIPSPV.cpp index d68c87e9b3e71..e6717e244545c 100644 --- a/clang/lib/Driver/ToolChains/HIPSPV.cpp +++ b/clang/lib/Driver/ToolChains/HIPSPV.cpp @@ -154,7 +154,7 @@ void HIPSPVToolChain::addClangTargetOptions( CC1Args.append( {"-fvisibility", "hidden", "-fapply-global-visibility-to-externs"}); - llvm::for_each(getHIPDeviceLibs(DriverArgs), + llvm::for_each(getHIPDeviceLibs(DriverArgs, DeviceOffloadingKind), [&](const BitCodeLibraryInfo &BCFile) { CC1Args.append({"-mlink-builtin-bitcode", DriverArgs.MakeArgString(BCFile.Path)}); @@ -206,7 +206,9 @@ void HIPSPVToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs, } llvm::SmallVector -HIPSPVToolChain::getHIPDeviceLibs(const llvm::opt::ArgList &DriverArgs) const { +HIPSPVToolChain::getHIPDeviceLibs( + const llvm::opt::ArgList &DriverArgs, + const Action::OffloadKind DeviceOffloadingKind) const { llvm::SmallVector BCLibs; if (DriverArgs.hasArg(options::OPT_nogpulib)) return {}; diff --git a/clang/lib/Driver/ToolChains/HIPSPV.h b/clang/lib/Driver/ToolChains/HIPSPV.h index 79520f77c742f..036f09e5872f7 100644 --- a/clang/lib/Driver/ToolChains/HIPSPV.h +++ b/clang/lib/Driver/ToolChains/HIPSPV.h @@ -68,8 +68,9 @@ class LLVM_LIBRARY_VISIBILITY HIPSPVToolChain final : public ToolChain { llvm::opt::ArgStringList &CC1Args) const override; void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; - llvm::SmallVector - getHIPDeviceLibs(const llvm::opt::ArgList &Args) const override; + llvm::SmallVector getHIPDeviceLibs( + const llvm::opt::ArgList &Args, + const Action::OffloadKind DeviceOffloadingKind) const override; SanitizerMask getSupportedSanitizers() const override; diff --git a/clang/test/Driver/sycl-amdgcn-sqrt.cpp b/clang/test/Driver/sycl-amdgcn-sqrt.cpp new file mode 100644 index 0000000000000..1c0ee7077dd69 --- /dev/null +++ b/clang/test/Driver/sycl-amdgcn-sqrt.cpp @@ -0,0 +1,35 @@ +// REQUIRES: clang-driver +// REQUIRES: amdgpu-registered-target +// REQUIRES: !system-windows + +// RUN: %clang -### \ +// RUN: -fsycl -fsycl-targets=amdgcn-amd-amdhsa \ +// RUN: -Xsycl-target-backend --offload-arch=gfx900 \ +// RUN: -fsycl-fp32-prec-sqrt \ +// RUN: --rocm-path=%S/Inputs/rocm \ +// RUN: %s \ +// RUN: 2>&1 | FileCheck --check-prefix=CHECK-CORRECT %s + +// CHECK-CORRECT: "-mlink-builtin-bitcode" "{{.*}}/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc" + +// RUN: %clang -### \ +// RUN: -fsycl -fsycl-targets=amdgcn-amd-amdhsa \ +// RUN: -Xsycl-target-backend --offload-arch=gfx900 \ +// RUN: --rocm-path=%S/Inputs/rocm \ +// RUN: %s \ +// RUN: 2>&1 | FileCheck --check-prefix=CHECK-APPROX %s + +// CHECK-APPROX: "-mlink-builtin-bitcode" "{{.*}}/amdgcn/bitcode/oclc_correctly_rounded_sqrt_off.bc" + +// RUN: %clang -### \ +// RUN: -fsycl -fsycl-targets=amdgcn-amd-amdhsa \ +// RUN: -Xsycl-target-backend --offload-arch=gfx900 \ +// RUN: -fsycl-fp32-prec-sqrt -fno-hip-fp32-correctly-rounded-divide-sqrt \ +// RUN: --rocm-path=%S/Inputs/rocm \ +// RUN: %s \ +// RUN: 2>&1 | FileCheck --check-prefix=CHECK-CONFLICT %s + +// CHECK-CONFLICT: warning: argument unused during compilation: '-fno-hip-fp32-correctly-rounded-divide-sqrt' +// CHECK-CONFLICT: "-mlink-builtin-bitcode" "{{.*}}/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc" + +void func(){}; diff --git a/clang/test/Driver/sycl-no-prec-sqrt.cpp b/clang/test/Driver/sycl-no-prec-sqrt.cpp new file mode 100644 index 0000000000000..a1100be284949 --- /dev/null +++ b/clang/test/Driver/sycl-no-prec-sqrt.cpp @@ -0,0 +1,17 @@ +// REQUIRES: clang-driver + +// RUN: %clang -### -fsycl \ +// RUN: -fsycl-fp32-prec-sqrt %s 2>&1 | FileCheck %s + +// RUN: %clang -### -fsycl -fsycl-targets=spir64_gen \ +// RUN: -fsycl-fp32-prec-sqrt %s 2>&1 | FileCheck %s +// +// RUN: %clang -### -fsycl -fsycl-targets=spir64_x86_64 \ +// RUN: -fsycl-fp32-prec-sqrt %s 2>&1 | FileCheck %s +// +// RUN: %clang -### -fsycl -fsycl-targets=spir64_fpga \ +// RUN: -fsycl-fp32-prec-sqrt %s 2>&1 | FileCheck %s + +// CHECK: warning: argument unused during compilation: '-fsycl-fp32-prec-sqrt' + +void func(){}; diff --git a/clang/test/Driver/sycl-nvptx-sqrt.cpp b/clang/test/Driver/sycl-nvptx-sqrt.cpp new file mode 100644 index 0000000000000..d5320b8c0e21e --- /dev/null +++ b/clang/test/Driver/sycl-nvptx-sqrt.cpp @@ -0,0 +1,19 @@ +// REQUIRES: clang-driver +// REQUIRES: nvptx-registered-target + +// RUN: %clang -### \ +// RUN: -fsycl -fsycl-targets=nvptx64-nvidia-cuda \ +// RUN: -fsycl-fp32-prec-sqrt \ +// RUN: %s \ +// RUN: 2>&1 | FileCheck --check-prefix=CHECK-CORRECT %s + +// CHECK-CORRECT: "-fcuda-prec-sqrt" + +// RUN: %clang -### \ +// RUN: -fsycl -fsycl-targets=nvptx64-nvidia-cuda \ +// RUN: %s \ +// RUN: 2>&1 | FileCheck --check-prefix=CHECK-APPROX %s + +// CHECK-APPROX-NOT: "-fcuda-prec-sqrt" + +void func(){}; diff --git a/sycl/doc/GetStartedGuide.md b/sycl/doc/GetStartedGuide.md index 026411d2aed50..5e6f9042851a6 100644 --- a/sycl/doc/GetStartedGuide.md +++ b/sycl/doc/GetStartedGuide.md @@ -832,7 +832,7 @@ which contains all the symbols required. project and may cause compilation issues on some platforms * `sycl::sqrt` is not correctly rounded by default as the SYCL specification allows lower precision, when porting from CUDA it may be helpful to use - `-Xclang -fcuda-prec-sqrt` to use the correctly rounded square root, this is + `-fsycl-fp32-prec-sqrt` to use the correctly rounded square root, this is significantly slower but matches the default precision used by `nvcc`, and this `clang++` flag is equivalent to the `nvcc` `-prec-sqrt` flag, except that it defaults to `false`. diff --git a/sycl/doc/UsersManual.md b/sycl/doc/UsersManual.md index 0e39e271471d5..046475cbcc571 100644 --- a/sycl/doc/UsersManual.md +++ b/sycl/doc/UsersManual.md @@ -257,6 +257,14 @@ and not recommended to use in production environment. options (e.g. -c, -E, -S) may interfere with the expected output set during the host compilation. Doing so is considered undefined behavior. +**`-fsycl-fp32-prec-sqrt`** + + Enable use of correctly rounded `sycl::sqrt` function as defined by IEE754. + Without this flag, the default precision requirement for `sycl::sqrt` is 3 + ULP. + + NOTE: This flag is currently only supported with the CUDA and HIP targets. + # Example: SYCL device code compilation To invoke SYCL device compiler set `-fsycl-device-only` flag.