From d45acd80c00efc17b4ea80923bc617f35e21d8d1 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Thu, 10 Oct 2019 17:18:39 -0700 Subject: [PATCH] [SYCL] Enable useful (not random) output from stream Pool of flush buffers is allocated in local memory. This pool contains space for each work item in the work group. Each work item writes to its own space (flush buffer), as a result output from different work items is not mixed. Data is flushed to global buffer on endl, flush or when kernel execution is finished. Global buffer contains all output from the kernel. Offset of the WI's flush buffer in the pool is calculated only once in __init method. Call to this method is generated by frontend. In the current implementation user should explicitly flush data on the host device. Data is not flushed automatically after kernel execution because of the missing feature in the scheduler. Signed-off-by: Artur Gainullin --- clang/lib/Sema/SemaSYCL.cpp | 19 +- sycl/include/CL/sycl/accessor.hpp | 24 +-- sycl/include/CL/sycl/detail/accessor_impl.hpp | 17 ++ sycl/include/CL/sycl/detail/stream_impl.hpp | 163 +++++++++++------- sycl/include/CL/sycl/handler.hpp | 10 ++ sycl/include/CL/sycl/stream.hpp | 130 +++++++++++--- sycl/source/detail/stream_impl.cpp | 5 +- sycl/source/stream.cpp | 24 ++- sycl/test/basic_tests/stream/auto_flush.cpp | 34 ++++ sycl/test/basic_tests/{ => stream}/stream.cpp | 12 +- sycl/test/linear_id/linear-host-dev.cpp | 2 +- 11 files changed, 328 insertions(+), 112 deletions(-) create mode 100644 sycl/test/basic_tests/stream/auto_flush.cpp rename sycl/test/basic_tests/{ => stream}/stream.cpp (96%) diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index e440e21449e8..52ccdc1a7e81 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -63,6 +63,10 @@ class Util { /// sampler class. static bool isSyclSamplerType(const QualType &Ty); + /// Checks whether given clang type is a full specialization of the SYCL + /// stream class. + static bool isSyclStreamType(const QualType &Ty); + /// Checks whether given clang type is a standard SYCL API class with given /// name. /// \param Ty the clang type being checked @@ -770,7 +774,7 @@ static CompoundStmt *CreateOpenCLKernelBody(Sema &S, // All special SYCL objects must have __init method CXXMethodDecl *InitMethod = getInitMethod(CRD); assert(InitMethod && - "The accessor/sampler must have the __init method"); + "The accessor/sampler/stream must have the __init method"); unsigned NumParams = InitMethod->getNumParams(); llvm::SmallVector ParamDREs(NumParams); auto KFP = KernelFuncParam; @@ -780,7 +784,9 @@ static CompoundStmt *CreateOpenCLKernelBody(Sema &S, S.Context, NestedNameSpecifierLoc(), SourceLocation(), *KFP, false, DeclarationNameInfo(), ParamType, VK_LValue); } - std::advance(KernelFuncParam, NumParams - 1); + + if (NumParams) + std::advance(KernelFuncParam, NumParams - 1); DeclAccessPair FieldDAP = DeclAccessPair::make(Field, AS_none); // [kernel_obj or wrapper object].special_obj @@ -909,6 +915,11 @@ static CompoundStmt *CreateOpenCLKernelBody(Sema &S, DeclarationNameInfo(Field->getDeclName(), SourceLocation()), nullptr, Field->getType(), VK_LValue, OK_Ordinary, NOUR_None); getExprForWrappedAccessorInit(CRD, Lhs); + if (Util::isSyclStreamType(FieldType)) { + // Generate call to the __init method of the stream class after + // initializing accessors wrapped by this stream object + getExprForSpecialSYCLObj(FieldType, Field, CRD, KernelObjCloneRef); + } } } else { llvm_unreachable("Unsupported field type"); @@ -1714,6 +1725,10 @@ bool Util::isSyclSamplerType(const QualType &Ty) { return isSyclType(Ty, "sampler"); } +bool Util::isSyclStreamType(const QualType &Ty) { + return isSyclType(Ty, "stream"); +} + bool Util::isSyclType(const QualType &Ty, StringRef Name, bool Tmpl) { Decl::Kind ClassDeclKind = Tmpl ? Decl::Kind::ClassTemplateSpecialization : Decl::Kind::CXXRecord; diff --git a/sycl/include/CL/sycl/accessor.hpp b/sycl/include/CL/sycl/accessor.hpp index 1672a4ffe26d..043979ac03f7 100644 --- a/sycl/include/CL/sycl/accessor.hpp +++ b/sycl/include/CL/sycl/accessor.hpp @@ -1111,26 +1111,26 @@ class accessor> - operator atomic() const { + template + operator typename detail::enable_if_t< + Dims == 0 && AccessMode == access::mode::atomic, atomic>() + const { return atomic(multi_ptr(getQualifiedPtr())); } - template 0) && - AccessMode == access::mode::atomic>> - atomic operator[](id Index) const { + template + typename detail::enable_if_t<(Dims > 0) && AccessMode == access::mode::atomic, + atomic> + operator[](id Index) const { const size_t LinearIndex = getLinearIndex(Index); return atomic( multi_ptr(getQualifiedPtr() + LinearIndex)); } - template > - atomic operator[](size_t Index) const { + template + typename detail::enable_if_t> + operator[](size_t Index) const { return atomic(multi_ptr(getQualifiedPtr() + Index)); } diff --git a/sycl/include/CL/sycl/detail/accessor_impl.hpp b/sycl/include/CL/sycl/detail/accessor_impl.hpp index 131b353cea08..96efd332118b 100644 --- a/sycl/include/CL/sycl/detail/accessor_impl.hpp +++ b/sycl/include/CL/sycl/detail/accessor_impl.hpp @@ -141,6 +141,23 @@ class LocalAccessorImplHost { int MDims; int MElemSize; std::vector MMem; + + bool PerWI = false; + size_t LocalMemSize; + size_t MaxWGSize; + void resize(size_t LocalSize, size_t GlobalSize) { + if (GlobalSize != 1 && LocalSize != 1) { + // If local size is not specified then work group size is chosen by + // runtime. That is why try to allocate based on max work group size or + // global size. In the worst case allocate 80% of local memory. + size_t MinEstWGSize = LocalSize ? LocalSize : GlobalSize; + MinEstWGSize = MinEstWGSize > MaxWGSize ? MaxWGSize : MinEstWGSize; + size_t NewSize = MinEstWGSize * MSize[0]; + MSize[0] = + NewSize > 8 * LocalMemSize / 10 ? 8 * LocalMemSize / 10 : NewSize; + MMem.resize(NewSize * MElemSize); + } + } }; class LocalAccessorBaseHost { diff --git a/sycl/include/CL/sycl/detail/stream_impl.hpp b/sycl/include/CL/sycl/detail/stream_impl.hpp index f1c5accf533b..305f2d2298d1 100644 --- a/sycl/include/CL/sycl/detail/stream_impl.hpp +++ b/sycl/include/CL/sycl/detail/stream_impl.hpp @@ -19,6 +19,7 @@ namespace cl { namespace sycl { namespace detail { + using FmtFlags = unsigned int; // Mapping from stream_manipulator to FmtFlags. Each manipulator corresponds @@ -78,31 +79,49 @@ using EnableIfSwizzleVec = class stream_impl { public: - using AccessorType = accessor; + using GlobalBufAccessorT = + accessor; - using OffsetAccessorType = + using GlobalOffsetAccessorT = accessor; + using FlushBufAccessorT = + accessor; + + using LocalOffsetAccessorT = + accessor; + stream_impl(size_t BufferSize, size_t MaxStatementSize, handler &CGH); - // Method to provide an access to the stream buffer - AccessorType getAccessor(handler &CGH) { + // Method to provide an access to the global stream buffer + GlobalBufAccessorT accessGlobalBuf(handler &CGH) { return Buf.get_access( CGH, range<1>(BufferSize_), id<1>(OffsetSize)); } - // Method to provide an atomic access to the offset in the stream buffer - OffsetAccessorType getOffsetAccessor(handler &CGH) { + // Method to provide an atomic access to the offset in the global stream + // buffer + GlobalOffsetAccessorT accessGlobalOffset(handler &CGH) { auto OffsetSubBuf = buffer(Buf, id<1>(0), range<1>(OffsetSize)); auto ReinterpretedBuf = OffsetSubBuf.reinterpret(range<1>(1)); return ReinterpretedBuf.get_access( CGH, range<1>(1), id<1>(0)); } + // Method to provide an atomic access to the flush buffer size + GlobalOffsetAccessorT accessFlushBufferSize(handler &CGH) { + return FlushBufferSize.get_access( + CGH, range<1>(1), id<1>(0)); + } + // Copy stream buffer to the host and print the contents void flush(); @@ -116,7 +135,7 @@ class stream_impl { // Maximum number of symbols which could be streamed from the beginning of a // statement till the semicolon - size_t MaxStatementSize_; + unsigned MaxStatementSize_; // Size of the variable which is used as an offset in the stream buffer. // Additinonal memory is allocated in the beginning of the stream buffer for @@ -128,6 +147,9 @@ class stream_impl { // Stream buffer buffer Buf; + + // Buffer for flush buffer size + buffer FlushBufferSize; }; template @@ -268,38 +290,55 @@ EnableIfFP floatingPointToDecStr(T AbsVal, char *Digits, return Offset; } -// Helper method to update offset atomically according to the provided -// operand size of the output operator. Return true if offset is updated and -// false in case of overflow. -inline bool updateOffset(stream_impl::OffsetAccessorType &OffsetAcc, - stream_impl::AccessorType &Acc, unsigned Size, - unsigned &Cur) { +// Helper method to update offset in the global buffer atomically according to +// the provided size of the data in the flush buffer. Return true if offset is +// updated and false in case of overflow. +inline bool updateOffset(stream_impl::GlobalOffsetAccessorT &GlobalOffset, + stream_impl::GlobalBufAccessorT &GlobalBuf, + unsigned Size, unsigned &Cur) { unsigned New; + Cur = GlobalOffset[0].load(); do { - Cur = OffsetAcc[0].load(); - if (Acc.get_count() - Cur < Size) + if (GlobalBuf.get_range().size() - Cur < Size) // Overflow return false; New = Cur + Size; - } while (!OffsetAcc[0].compare_exchange_strong(Cur, New)); + } while (!GlobalOffset[0].compare_exchange_strong(Cur, New)); return true; } -inline void write(stream_impl::OffsetAccessorType &OffsetAcc, - stream_impl::AccessorType &Acc, unsigned Len, const char *Buf, - unsigned Padding = 0) { +inline void flushBuffer(stream_impl::GlobalOffsetAccessorT &GlobalOffset, + stream_impl::GlobalBufAccessorT &GlobalBuf, + stream_impl::FlushBufAccessorT &FlushBufs, + unsigned &WIOffset, unsigned &Offset) { + // Copy data from flush buffer (local memory) to global buffer (global + // memory) unsigned Cur = 0; - if (!updateOffset(OffsetAcc, Acc, Len + Padding, Cur)) + if (!updateOffset(GlobalOffset, GlobalBuf, Offset, Cur)) return; - size_t I = 0; + for (unsigned I = WIOffset; I < WIOffset + Offset; I++) { + GlobalBuf[Cur++] = FlushBufs[I]; + } + // Reset the offset in the flush buffer + Offset = 0; +} + +inline void write(stream_impl::FlushBufAccessorT &FlushBufs, + size_t FlushBufferSize, unsigned WIOffset, unsigned &Offset, + const char *Str, unsigned Len, unsigned Padding = 0) { + if ((FlushBufferSize - Offset < Len + Padding) || + (WIOffset + Offset + Len + Padding > FlushBufs.get_count())) + // TODO: flush here + return; // Write padding - for (; I < Padding; ++I) - Acc[I + Cur] = ' '; + for (size_t I = 0; I < Padding; ++I, ++Offset) + FlushBufs[WIOffset + Offset] = ' '; - for (; I < Len; I++) - Acc[I + Cur] = Buf[I]; + for (size_t I = 0; I < Len; ++I, ++Offset) { + FlushBufs[WIOffset + Offset] = Str[I]; + } } inline void reverseBuf(char *Buf, unsigned Len) { @@ -437,12 +476,12 @@ ScalarToStr(const T &Val, char *Buf, unsigned Flags, int Width, template inline typename std::enable_if::value>::type -writeIntegral(stream_impl::OffsetAccessorType &OffsetAcc, - stream_impl::AccessorType &Acc, unsigned Flags, int Width, +writeIntegral(stream_impl::FlushBufAccessorT &FlushBufs, size_t FlushBufferSize, + unsigned WIOffset, unsigned &Offset, unsigned Flags, int Width, const T &Val) { char Digits[MAX_INTEGRAL_DIGITS] = {0}; unsigned Len = ScalarToStr(Val, Digits, Flags, Width); - write(OffsetAcc, Acc, Len, Digits, + write(FlushBufs, FlushBufferSize, WIOffset, Offset, Digits, Len, (Width > 0 && static_cast(Width) > Len) ? static_cast(Width) - Len : 0); @@ -450,12 +489,12 @@ writeIntegral(stream_impl::OffsetAccessorType &OffsetAcc, template inline EnableIfFP -writeFloatingPoint(stream_impl::OffsetAccessorType &OffsetAcc, - stream_impl::AccessorType &Acc, unsigned Flags, int Width, - int Precision, const T &Val) { +writeFloatingPoint(stream_impl::FlushBufAccessorT &FlushBufs, + size_t FlushBufferSize, unsigned WIOffset, unsigned &Offset, + unsigned Flags, int Width, int Precision, const T &Val) { char Digits[MAX_FLOATING_POINT_DIGITS] = {0}; unsigned Len = ScalarToStr(Val, Digits, Flags, Width, Precision); - write(OffsetAcc, Acc, Len, Digits, + write(FlushBufs, FlushBufferSize, WIOffset, Offset, Digits, Len, (Width > 0 && static_cast(Width) > Len) ? static_cast(Width) - Len : 0); @@ -493,15 +532,16 @@ VecToStr(const vec &Vec, char *VecStr, unsigned Flags, int Width, } template -inline void writeVec(stream_impl::OffsetAccessorType &OffsetAcc, - stream_impl::AccessorType &Acc, unsigned Flags, int Width, - int Precision, const vec &Vec) { +inline void writeVec(stream_impl::FlushBufAccessorT &FlushBufs, + size_t FlushBufferSize, unsigned WIOffset, + unsigned &Offset, unsigned Flags, int Width, int Precision, + const vec &Vec) { // Reserve space for vector elements and delimiters constexpr size_t MAX_VEC_SIZE = MAX_FLOATING_POINT_DIGITS * VecLength + (VecLength - 1) * 2; char VecStr[MAX_VEC_SIZE] = {0}; unsigned Len = VecToStr(Vec, VecStr, Flags, Width, Precision); - write(OffsetAcc, Acc, Len, VecStr, + write(FlushBufs, FlushBufferSize, WIOffset, Offset, VecStr, Len, (Width > 0 && Width > Len) ? Width - Len : 0); } @@ -522,18 +562,18 @@ inline unsigned ArrayToStr(char *Buf, const array &Arr) { } template -inline void writeArray(stream_impl::OffsetAccessorType &OffsetAcc, - stream_impl::AccessorType &Acc, - const array &Arr) { +inline void writeArray(stream_impl::FlushBufAccessorT &FlushBufs, + size_t FlushBufferSize, unsigned WIOffset, + unsigned &Offset, const array &Arr) { char Buf[MAX_ARRAY_SIZE]; unsigned Len = ArrayToStr(Buf, Arr); - write(OffsetAcc, Acc, Len, Buf); + write(FlushBufs, FlushBufferSize, WIOffset, Offset, Buf, Len); } template -inline void writeItem(stream_impl::OffsetAccessorType &OffsetAcc, - stream_impl::AccessorType &Acc, - const item &Item) { +inline void writeItem(stream_impl::FlushBufAccessorT &FlushBufs, + size_t FlushBufferSize, unsigned WIOffset, + unsigned &Offset, const item &Item) { // Reserve space for 3 arrays and additional place (40 symbols) for printing // the text char Buf[3 * MAX_ARRAY_SIZE + 40]; @@ -546,12 +586,13 @@ inline void writeItem(stream_impl::OffsetAccessorType &OffsetAcc, Len += append(Buf + Len, ", offset: "); Len += ArrayToStr(Buf + Len, Item.get_offset()); Buf[Len++] = ')'; - write(OffsetAcc, Acc, Len, Buf); + write(FlushBufs, FlushBufferSize, WIOffset, Offset, Buf, Len); } template -inline void writeNDRange(stream_impl::OffsetAccessorType &OffsetAcc, - stream_impl::AccessorType &Acc, +inline void writeNDRange(stream_impl::FlushBufAccessorT &FlushBufs, + size_t FlushBufferSize, unsigned WIOffset, + unsigned &Offset, const nd_range &ND_Range) { // Reserve space for 3 arrays and additional place (50 symbols) for printing // the text @@ -565,13 +606,13 @@ inline void writeNDRange(stream_impl::OffsetAccessorType &OffsetAcc, Len += append(Buf + Len, ", offset: "); Len += ArrayToStr(Buf + Len, ND_Range.get_offset()); Buf[Len++] = ')'; - write(OffsetAcc, Acc, Len, Buf); + write(FlushBufs, FlushBufferSize, WIOffset, Offset, Buf, Len); } template -inline void writeNDItem(stream_impl::OffsetAccessorType &OffsetAcc, - stream_impl::AccessorType &Acc, - const nd_item &ND_Item) { +inline void writeNDItem(stream_impl::FlushBufAccessorT &FlushBufs, + size_t FlushBufferSize, unsigned WIOffset, + unsigned &Offset, const nd_item &ND_Item) { // Reserve space for 2 arrays and additional place (40 symbols) for printing // the text char Buf[2 * MAX_ARRAY_SIZE + 40]; @@ -582,13 +623,13 @@ inline void writeNDItem(stream_impl::OffsetAccessorType &OffsetAcc, Len += append(Buf + Len, ", local_id: "); Len += ArrayToStr(Buf + Len, ND_Item.get_local_id()); Buf[Len++] = ')'; - write(OffsetAcc, Acc, Len, Buf); + write(FlushBufs, FlushBufferSize, WIOffset, Offset, Buf, Len); } template -inline void writeGroup(stream_impl::OffsetAccessorType &OffsetAcc, - stream_impl::AccessorType &Acc, - const group &Group) { +inline void writeGroup(stream_impl::FlushBufAccessorT &FlushBufs, + size_t FlushBufferSize, unsigned WIOffset, + unsigned &Offset, const group &Group) { // Reserve space for 4 arrays and additional place (60 symbols) for printing // the text char Buf[4 * MAX_ARRAY_SIZE + 60]; @@ -603,7 +644,7 @@ inline void writeGroup(stream_impl::OffsetAccessorType &OffsetAcc, Len += append(Buf + Len, ", group_range: "); Len += ArrayToStr(Buf + Len, Group.get_group_range()); Buf[Len++] = ')'; - write(OffsetAcc, Acc, Len, Buf); + write(FlushBufs, FlushBufferSize, WIOffset, Offset, Buf, Len); } // Space for 2 arrays and additional place (20 symbols) for printing @@ -623,9 +664,9 @@ inline unsigned ItemToStr(char *Buf, const item &Item) { } template -inline void writeHItem(stream_impl::OffsetAccessorType &OffsetAcc, - stream_impl::AccessorType &Acc, - const h_item &HItem) { +inline void writeHItem(stream_impl::FlushBufAccessorT &FlushBufs, + size_t FlushBufferSize, unsigned WIOffset, + unsigned &Offset, const h_item &HItem) { // Reserve space for 3 items and additional place (60 symbols) for printing // the text char Buf[3 * MAX_ITEM_SIZE + 60]; @@ -640,7 +681,7 @@ inline void writeHItem(stream_impl::OffsetAccessorType &OffsetAcc, : HItem.get_physical_local()); } Len += append(Buf + Len, "\n)"); - write(OffsetAcc, Acc, Len, Buf); + write(FlushBufs, FlushBufferSize, WIOffset, Offset, Buf, Len); } } // namespace detail diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 531dea83e437..3dd2bfea591a 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -286,6 +286,15 @@ class handler { case access::target::local: { detail::LocalAccessorBaseHost *LAcc = static_cast(Ptr); + // Stream implementation creates local accessor with size per work item + // in work group. Number of work items is not available during stream + // construction, that is why size of the accessor is updated here using + // information about number of work items in the work group. + if (detail::getSyclObjImpl(*LAcc)->PerWI) { + auto LocalAccImpl = detail::getSyclObjImpl(*LAcc); + LocalAccImpl->resize(MNDRDesc.LocalSize.size(), + MNDRDesc.GlobalSize.size()); + } range<3> &Size = LAcc->getSize(); const int Dims = LAcc->getNumOfDims(); int SizeInBytes = LAcc->getElementSize(); @@ -526,6 +535,7 @@ class handler { friend class detail::image_accessor; // Make stream class friend to be able to keep the list of associated streams friend class stream; + friend class detail::stream_impl; public: handler(const handler &) = delete; diff --git a/sycl/include/CL/sycl/stream.hpp b/sycl/include/CL/sycl/stream.hpp index 5401b0d55511..5c78eda50e6d 100644 --- a/sycl/include/CL/sycl/stream.hpp +++ b/sycl/include/CL/sycl/stream.hpp @@ -22,6 +22,7 @@ enum class stream_manipulator { noshowpos, showpos, endl, + flush, fixed, scientific, hexfloat, @@ -44,6 +45,8 @@ constexpr stream_manipulator showpos = stream_manipulator::showpos; constexpr stream_manipulator endl = stream_manipulator::endl; +constexpr stream_manipulator flush = stream_manipulator::flush; + constexpr stream_manipulator fixed = stream_manipulator::fixed; constexpr stream_manipulator scientific = stream_manipulator::scientific; @@ -104,6 +107,18 @@ class stream { bool operator!=(const stream &LHS) const; + ~stream() { + // Flush data to global buffer in stream destruction if flush buffer is not + // empty. This could be necessary if user hasn't flushed data himself and + // kernel execution is finished + // NOTE: In the current implementation user should explicitly flush data on + // the host device. Data is not flushed automatically after kernel execution + // because of the missing feature in scheduler. + if (Offset) { + flushBuffer(GlobalOffset, GlobalBuf, FlushBufs, WIOffset, Offset); + } + } + private: #ifdef __SYCL_DEVICE_ONLY__ char padding[sizeof(std::shared_ptr)]; @@ -113,15 +128,39 @@ class stream { friend decltype(Obj::impl) detail::getSyclObjImpl(const Obj &SyclObject); #endif - // Accessor to stream buffer - mutable detail::stream_impl::AccessorType Acc; + // Accessor to the global stream buffer. Global buffer contains all output + // from the kernel. + mutable detail::stream_impl::GlobalBufAccessorT GlobalBuf; - // Atomic accessor to the offset variable. It represents an offset in the - // stream buffer. - mutable detail::stream_impl::OffsetAccessorType OffsetAcc; - mutable stream_manipulator Manipulator = defaultfloat; + // Atomic accessor to the global offset variable. It represents an offset in + // the global stream buffer. Since work items will flush data to global buffer + // in parallel we need atomic access to this offset. + mutable detail::stream_impl::GlobalOffsetAccessorT GlobalOffset; + + // Accessor to the pool of flush buffers. Flush buffer contains output from + // work item in the work group. One flush buffer per work item in the work + // group. + mutable detail::stream_impl::FlushBufAccessorT FlushBufs; + + // Each work item in the work group writes to its own flush buffer in the + // pool. This accessor is used to atomically get offset of the flush buffer in + // the pool for each work item in the work group. This approach is used + // because currently it is not possible to get work item id in the work group + // without id object, which is passed to the kernel. + mutable detail::stream_impl::LocalOffsetAccessorT WIOffsetAcc; + + mutable detail::stream_impl::GlobalOffsetAccessorT FlushSize; + + // Offset of the WI's flush buffer in the pool. + mutable unsigned WIOffset = 0; + + // Offset in the flush buffer + mutable unsigned Offset = 0; + + mutable size_t FlushBufferSize; // Fields and methods to work with manipulators + mutable stream_manipulator Manipulator = defaultfloat; // Type used for format flags using FmtFlags = unsigned int; @@ -186,6 +225,19 @@ class stream { } } +#ifdef __SYCL_DEVICE_ONLY__ + void __init() { + // Calculate work item id inside work group, this should be done once, that + // is why this is done in _init method, call to __init method is generated + // by frontend. As a result each work item will write to its own flush + // buffer. + FlushBufferSize = FlushSize[0].load(); + WIOffsetAcc[0].store(0); + detail::workGroupBarrier(); + WIOffset = WIOffsetAcc[0].fetch_add(FlushBufferSize); + } +#endif + friend const stream &operator<<(const stream &, const char); friend const stream &operator<<(const stream &, const char *); template @@ -195,7 +247,9 @@ class stream { friend const stream &operator<<(const stream &, const float &); friend const stream &operator<<(const stream &, const double &); friend const stream &operator<<(const stream &, const half &); + friend const stream &operator<<(const stream &, const stream_manipulator); + friend const stream &operator<<(const stream &Out, const __precision_manipulator__ &RHS); @@ -236,20 +290,22 @@ class stream { // Character inline const stream &operator<<(const stream &Out, const char C) { - unsigned Cur; - if (!detail::updateOffset(Out.OffsetAcc, Out.Acc, 1, Cur)) + if (Out.Offset >= Out.FlushBufferSize || + Out.WIOffset + Out.Offset + 1 > Out.FlushBufs.get_count()) return Out; - Out.Acc[Cur] = C; + Out.FlushBufs[Out.WIOffset + Out.Offset] = C; + ++Out.Offset; return Out; } // String inline const stream &operator<<(const stream &Out, const char *Str) { - unsigned Len; - for (Len = 0; Str[Len] != '\0'; Len++) + unsigned Len = 0; + for (; Str[Len] != '\0'; Len++) ; - detail::write(Out.OffsetAcc, Out.Acc, Len, Str); + detail::write(Out.FlushBufs, Out.FlushBufferSize, Out.WIOffset, Out.Offset, + Str, Len); return Out; } @@ -264,27 +320,30 @@ template inline typename std::enable_if::value, const stream &>::type operator<<(const stream &Out, const ValueType &RHS) { - detail::writeIntegral(Out.OffsetAcc, Out.Acc, Out.get_flags(), - Out.get_width(), RHS); + detail::writeIntegral(Out.FlushBufs, Out.FlushBufferSize, Out.WIOffset, + Out.Offset, Out.get_flags(), Out.get_width(), RHS); return Out; } // Floating points inline const stream &operator<<(const stream &Out, const float &RHS) { - detail::writeFloatingPoint(Out.OffsetAcc, Out.Acc, Out.get_flags(), + detail::writeFloatingPoint(Out.FlushBufs, Out.FlushBufferSize, + Out.WIOffset, Out.Offset, Out.get_flags(), Out.get_width(), Out.get_precision(), RHS); return Out; } inline const stream &operator<<(const stream &Out, const double &RHS) { - detail::writeFloatingPoint(Out.OffsetAcc, Out.Acc, Out.get_flags(), + detail::writeFloatingPoint(Out.FlushBufs, Out.FlushBufferSize, + Out.WIOffset, Out.Offset, Out.get_flags(), Out.get_width(), Out.get_precision(), RHS); return Out; } inline const stream &operator<<(const stream &Out, const half &RHS) { - detail::writeFloatingPoint(Out.OffsetAcc, Out.Acc, Out.get_flags(), + detail::writeFloatingPoint(Out.FlushBufs, Out.FlushBufferSize, + Out.WIOffset, Out.Offset, Out.get_flags(), Out.get_width(), Out.get_precision(), RHS); return Out; } @@ -303,7 +362,8 @@ const stream &operator<<(const stream &Out, const T *RHS) { detail::FmtFlags Flags = Out.get_flags(); Flags &= ~detail::BaseField; Flags |= detail::Hex | detail::ShowBase; - detail::writeIntegral(Out.OffsetAcc, Out.Acc, Flags, Out.get_width(), + detail::writeIntegral(Out.FlushBufs, Out.FlushBufferSize, Out.WIOffset, + Out.Offset, Flags, Out.get_width(), reinterpret_cast(RHS)); return Out; } @@ -312,13 +372,13 @@ const stream &operator<<(const stream &Out, const T *RHS) { inline const stream &operator<<(const stream &Out, const __precision_manipulator__ &RHS) { - Out.Width = RHS.precision(); + Out.Precision = RHS.precision(); return Out; } inline const stream &operator<<(const stream &Out, const __width_manipulator__ &RHS) { - Out.Precision = RHS.width(); + Out.Width = RHS.width(); return Out; } @@ -327,6 +387,12 @@ inline const stream &operator<<(const stream &Out, switch (RHS) { case stream_manipulator::endl: Out << '\n'; + flushBuffer(Out.GlobalOffset, Out.GlobalBuf, Out.FlushBufs, Out.WIOffset, + Out.Offset); + break; + case stream_manipulator::flush: + flushBuffer(Out.GlobalOffset, Out.GlobalBuf, Out.FlushBufs, Out.WIOffset, + Out.Offset); break; default: Out.set_manipulator(RHS); @@ -339,7 +405,8 @@ inline const stream &operator<<(const stream &Out, template const stream &operator<<(const stream &Out, const vec &RHS) { - detail::writeVec(Out.OffsetAcc, Out.Acc, Out.get_flags(), + detail::writeVec(Out.FlushBufs, Out.FlushBufferSize, + Out.WIOffset, Out.Offset, Out.get_flags(), Out.get_width(), Out.get_precision(), RHS); return Out; } @@ -348,49 +415,56 @@ const stream &operator<<(const stream &Out, const vec &RHS) { template inline const stream &operator<<(const stream &Out, const id &RHS) { - detail::writeArray(Out.OffsetAcc, Out.Acc, RHS); + detail::writeArray(Out.FlushBufs, Out.FlushBufferSize, + Out.WIOffset, Out.Offset, RHS); return Out; } template inline const stream &operator<<(const stream &Out, const range &RHS) { - detail::writeArray(Out.OffsetAcc, Out.Acc, RHS); + detail::writeArray(Out.FlushBufs, Out.FlushBufferSize, + Out.WIOffset, Out.Offset, RHS); return Out; } template inline const stream &operator<<(const stream &Out, const item &RHS) { - detail::writeItem(Out.OffsetAcc, Out.Acc, RHS); + detail::writeItem(Out.FlushBufs, Out.FlushBufferSize, + Out.WIOffset, Out.Offset, RHS); return Out; } template inline const stream &operator<<(const stream &Out, const nd_range &RHS) { - detail::writeNDRange(Out.OffsetAcc, Out.Acc, RHS); + detail::writeNDRange(Out.FlushBufs, Out.FlushBufferSize, + Out.WIOffset, Out.Offset, RHS); return Out; } template inline const stream &operator<<(const stream &Out, const nd_item &RHS) { - detail::writeNDItem(Out.OffsetAcc, Out.Acc, RHS); + detail::writeNDItem(Out.FlushBufs, Out.FlushBufferSize, + Out.WIOffset, Out.Offset, RHS); return Out; } template inline const stream &operator<<(const stream &Out, const group &RHS) { - detail::writeGroup(Out.OffsetAcc, Out.Acc, RHS); + detail::writeGroup(Out.FlushBufs, Out.FlushBufferSize, + Out.WIOffset, Out.Offset, RHS); return Out; } template inline const stream &operator<<(const stream &Out, const h_item &RHS) { - detail::writeHItem(Out.OffsetAcc, Out.Acc, RHS); + detail::writeHItem(Out.FlushBufs, Out.FlushBufferSize, + Out.WIOffset, Out.Offset, RHS); return Out; } diff --git a/sycl/source/detail/stream_impl.cpp b/sycl/source/detail/stream_impl.cpp index 1f8136410880..e66cd11ea6cb 100644 --- a/sycl/source/detail/stream_impl.cpp +++ b/sycl/source/detail/stream_impl.cpp @@ -23,7 +23,10 @@ stream_impl::stream_impl(size_t BufferSize, size_t MaxStatementSize, // 2. Offset is properly initialized. Data(BufferSize + OffsetSize + 1, 0), Buf(Data.data(), range<1>(BufferSize + OffsetSize + 1), - {property::buffer::use_host_ptr()}) {} + {property::buffer::use_host_ptr()}), + // This buffer is used to pass provided flsuh buffer size to the device + FlushBufferSize(&MaxStatementSize_, range<1>(1), + {property::buffer::use_host_ptr()}) {} size_t stream_impl::get_size() const { return BufferSize_; } diff --git a/sycl/source/stream.cpp b/sycl/source/stream.cpp index 599a03299d4c..4a67826b58bb 100644 --- a/sycl/source/stream.cpp +++ b/sycl/source/stream.cpp @@ -14,10 +14,32 @@ namespace sycl { stream::stream(size_t BufferSize, size_t MaxStatementSize, handler &CGH) : impl(std::make_shared(BufferSize, MaxStatementSize, CGH)), - Acc(impl->getAccessor(CGH)), OffsetAcc(impl->getOffsetAccessor(CGH)) { + GlobalBuf(impl->accessGlobalBuf(CGH)), + GlobalOffset(impl->accessGlobalOffset(CGH)), + // Allocate pool of flush buffers, which contains space for each work item + // in the work group + FlushBufs(MaxStatementSize, CGH), + // Offset of the WI's flush buffer in the pool, we need atomic access to + // this offset to differentiate work items so that output from work items + // is not mixed + WIOffsetAcc(range<1>(1), CGH), + FlushSize(impl->accessFlushBufferSize(CGH)), + FlushBufferSize(MaxStatementSize) { + // Save stream implementation in the handler so that stream will be alive // during kernel execution CGH.addStream(impl); + + // Set flag identifying that created local accessor has perWI size. Accessor + // will be resized in SYCL RT when number of work items per work group will be + // available. Local memory size and max work group size is provided to the + // accessor. This info is used to do allocation if work group size is not + // provided by user. + detail::getSyclObjImpl(FlushBufs)->PerWI = true; + detail::getSyclObjImpl(FlushBufs)->LocalMemSize = + CGH.MQueue->get_device().get_info(); + detail::getSyclObjImpl(FlushBufs)->MaxWGSize = + CGH.MQueue->get_device().get_info(); } size_t stream::get_size() const { return impl->get_size(); } diff --git a/sycl/test/basic_tests/stream/auto_flush.cpp b/sycl/test/basic_tests/stream/auto_flush.cpp new file mode 100644 index 000000000000..4f28c21e52c9 --- /dev/null +++ b/sycl/test/basic_tests/stream/auto_flush.cpp @@ -0,0 +1,34 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// TODO: Enable on host when commands cleanup will be implemented in scheduler +// RUN: %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER +// RUN: %GPU_RUN_ON_LINUX_PLACEHOLDER %t.out %GPU_CHECK_ON_LINUX_PLACEHOLDER +// RUN: %ACC_RUN_PLACEHOLDER %t.out %ACC_CHECK_PLACEHOLDER +// TODO: SYCL specific fail - analyze and enable +//==-------------- copy.cpp - SYCL stream obect auto flushing test ---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cl::sycl; + +int main() { + queue Queue; + + // Test that data is flushed to the buffer at the end of kernel execution even + // without explicit flush + Queue.submit([&](handler &CGH) { + stream Out(1024, 80, CGH); + CGH.parallel_for( + range<1>(2), [=](id<1> i) { Out << "Hello World!\n"; }); + }); + Queue.wait(); + // CHECK: Hello World! + // CHECK-NEXT: Hello World! + + return 0; +} diff --git a/sycl/test/basic_tests/stream.cpp b/sycl/test/basic_tests/stream/stream.cpp similarity index 96% rename from sycl/test/basic_tests/stream.cpp rename to sycl/test/basic_tests/stream/stream.cpp index 174cee682bcb..a5af301774c1 100644 --- a/sycl/test/basic_tests/stream.cpp +++ b/sycl/test/basic_tests/stream/stream.cpp @@ -208,7 +208,7 @@ int main() { Queue.submit([&](handler &CGH) { stream Out(1024, 80, CGH); CGH.parallel_for( - range<1>(10), [=](id<1> i) { Out << "Hello, World!\n"; }); + range<1>(10), [=](id<1> i) { Out << "Hello, World!" << endl; }); }); Queue.wait(); // CHECK-NEXT: Hello, World! @@ -237,7 +237,7 @@ int main() { // CHECK-NEXT: nd_item(global_id: {1, 2, 3}, local_id: {1, 0, 1}) Queue.submit([&](handler &CGH) { - stream Out(1024, 80, CGH); + stream Out(1024, 200, CGH); CGH.parallel_for_work_group( range<3>(1, 1, 1), range<3>(1, 1, 1), [=](group<3> Group) { Group.parallel_for_work_item( @@ -253,10 +253,10 @@ int main() { // Multiple streams in command group Queue.submit([&](handler &CGH) { stream Out1(1024, 80, CGH); - stream Out2(500, 10, CGH); + stream Out2(500, 20, CGH); CGH.parallel_for(range<1>(2), [=](id<1> i) { - Out1 << "Hello, World!\n"; - Out2 << "Hello, World!\n"; + Out1 << "Hello, World!" << endl; + Out2 << "Hello, World!" << endl; }); }); Queue.wait(); @@ -270,7 +270,7 @@ int main() { Queue.submit([&](handler &CGH) { stream Out(10, 10, CGH); CGH.parallel_for( - range<1>(2), [=](id<1> i) { Out << "aaaaaaaaa\n"; }); + range<1>(2), [=](id<1> i) { Out << "aaaaaaaaa" << endl; }); }); Queue.wait(); } diff --git a/sycl/test/linear_id/linear-host-dev.cpp b/sycl/test/linear_id/linear-host-dev.cpp index 89437da13556..e321c7df0be6 100644 --- a/sycl/test/linear_id/linear-host-dev.cpp +++ b/sycl/test/linear_id/linear-host-dev.cpp @@ -42,7 +42,7 @@ int main(int argc, char *argv[]) { // CHECK-NEXT: 3 // CHECK-NEXT: 4 // CHECK-NEXT: 5 - out << item.get_linear_id() << "\n"; + out << item.get_linear_id() << cl::sycl::endl; }); });