diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index e440e21449e87..52ccdc1a7e815 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -63,6 +63,10 @@ class Util { /// sampler class. static bool isSyclSamplerType(const QualType &Ty); + /// Checks whether given clang type is a full specialization of the SYCL + /// stream class. + static bool isSyclStreamType(const QualType &Ty); + /// Checks whether given clang type is a standard SYCL API class with given /// name. /// \param Ty the clang type being checked @@ -770,7 +774,7 @@ static CompoundStmt *CreateOpenCLKernelBody(Sema &S, // All special SYCL objects must have __init method CXXMethodDecl *InitMethod = getInitMethod(CRD); assert(InitMethod && - "The accessor/sampler must have the __init method"); + "The accessor/sampler/stream must have the __init method"); unsigned NumParams = InitMethod->getNumParams(); llvm::SmallVector ParamDREs(NumParams); auto KFP = KernelFuncParam; @@ -780,7 +784,9 @@ static CompoundStmt *CreateOpenCLKernelBody(Sema &S, S.Context, NestedNameSpecifierLoc(), SourceLocation(), *KFP, false, DeclarationNameInfo(), ParamType, VK_LValue); } - std::advance(KernelFuncParam, NumParams - 1); + + if (NumParams) + std::advance(KernelFuncParam, NumParams - 1); DeclAccessPair FieldDAP = DeclAccessPair::make(Field, AS_none); // [kernel_obj or wrapper object].special_obj @@ -909,6 +915,11 @@ static CompoundStmt *CreateOpenCLKernelBody(Sema &S, DeclarationNameInfo(Field->getDeclName(), SourceLocation()), nullptr, Field->getType(), VK_LValue, OK_Ordinary, NOUR_None); getExprForWrappedAccessorInit(CRD, Lhs); + if (Util::isSyclStreamType(FieldType)) { + // Generate call to the __init method of the stream class after + // initializing accessors wrapped by this stream object + getExprForSpecialSYCLObj(FieldType, Field, CRD, KernelObjCloneRef); + } } } else { llvm_unreachable("Unsupported field type"); @@ -1714,6 +1725,10 @@ bool Util::isSyclSamplerType(const QualType &Ty) { return isSyclType(Ty, "sampler"); } +bool Util::isSyclStreamType(const QualType &Ty) { + return isSyclType(Ty, "stream"); +} + bool Util::isSyclType(const QualType &Ty, StringRef Name, bool Tmpl) { Decl::Kind ClassDeclKind = Tmpl ? Decl::Kind::ClassTemplateSpecialization : Decl::Kind::CXXRecord; diff --git a/sycl/include/CL/sycl/accessor.hpp b/sycl/include/CL/sycl/accessor.hpp index 1672a4ffe26d6..043979ac03f76 100644 --- a/sycl/include/CL/sycl/accessor.hpp +++ b/sycl/include/CL/sycl/accessor.hpp @@ -1111,26 +1111,26 @@ class accessor> - operator atomic() const { + template + operator typename detail::enable_if_t< + Dims == 0 && AccessMode == access::mode::atomic, atomic>() + const { return atomic(multi_ptr(getQualifiedPtr())); } - template 0) && - AccessMode == access::mode::atomic>> - atomic operator[](id Index) const { + template + typename detail::enable_if_t<(Dims > 0) && AccessMode == access::mode::atomic, + atomic> + operator[](id Index) const { const size_t LinearIndex = getLinearIndex(Index); return atomic( multi_ptr(getQualifiedPtr() + LinearIndex)); } - template > - atomic operator[](size_t Index) const { + template + typename detail::enable_if_t> + operator[](size_t Index) const { return atomic(multi_ptr(getQualifiedPtr() + Index)); } diff --git a/sycl/include/CL/sycl/detail/accessor_impl.hpp b/sycl/include/CL/sycl/detail/accessor_impl.hpp index 131b353cea089..96efd332118bb 100644 --- a/sycl/include/CL/sycl/detail/accessor_impl.hpp +++ b/sycl/include/CL/sycl/detail/accessor_impl.hpp @@ -141,6 +141,23 @@ class LocalAccessorImplHost { int MDims; int MElemSize; std::vector MMem; + + bool PerWI = false; + size_t LocalMemSize; + size_t MaxWGSize; + void resize(size_t LocalSize, size_t GlobalSize) { + if (GlobalSize != 1 && LocalSize != 1) { + // If local size is not specified then work group size is chosen by + // runtime. That is why try to allocate based on max work group size or + // global size. In the worst case allocate 80% of local memory. + size_t MinEstWGSize = LocalSize ? LocalSize : GlobalSize; + MinEstWGSize = MinEstWGSize > MaxWGSize ? MaxWGSize : MinEstWGSize; + size_t NewSize = MinEstWGSize * MSize[0]; + MSize[0] = + NewSize > 8 * LocalMemSize / 10 ? 8 * LocalMemSize / 10 : NewSize; + MMem.resize(NewSize * MElemSize); + } + } }; class LocalAccessorBaseHost { diff --git a/sycl/include/CL/sycl/detail/stream_impl.hpp b/sycl/include/CL/sycl/detail/stream_impl.hpp index f1c5accf533b0..305f2d2298d17 100644 --- a/sycl/include/CL/sycl/detail/stream_impl.hpp +++ b/sycl/include/CL/sycl/detail/stream_impl.hpp @@ -19,6 +19,7 @@ namespace cl { namespace sycl { namespace detail { + using FmtFlags = unsigned int; // Mapping from stream_manipulator to FmtFlags. Each manipulator corresponds @@ -78,31 +79,49 @@ using EnableIfSwizzleVec = class stream_impl { public: - using AccessorType = accessor; + using GlobalBufAccessorT = + accessor; - using OffsetAccessorType = + using GlobalOffsetAccessorT = accessor; + using FlushBufAccessorT = + accessor; + + using LocalOffsetAccessorT = + accessor; + stream_impl(size_t BufferSize, size_t MaxStatementSize, handler &CGH); - // Method to provide an access to the stream buffer - AccessorType getAccessor(handler &CGH) { + // Method to provide an access to the global stream buffer + GlobalBufAccessorT accessGlobalBuf(handler &CGH) { return Buf.get_access( CGH, range<1>(BufferSize_), id<1>(OffsetSize)); } - // Method to provide an atomic access to the offset in the stream buffer - OffsetAccessorType getOffsetAccessor(handler &CGH) { + // Method to provide an atomic access to the offset in the global stream + // buffer + GlobalOffsetAccessorT accessGlobalOffset(handler &CGH) { auto OffsetSubBuf = buffer(Buf, id<1>(0), range<1>(OffsetSize)); auto ReinterpretedBuf = OffsetSubBuf.reinterpret(range<1>(1)); return ReinterpretedBuf.get_access( CGH, range<1>(1), id<1>(0)); } + // Method to provide an atomic access to the flush buffer size + GlobalOffsetAccessorT accessFlushBufferSize(handler &CGH) { + return FlushBufferSize.get_access( + CGH, range<1>(1), id<1>(0)); + } + // Copy stream buffer to the host and print the contents void flush(); @@ -116,7 +135,7 @@ class stream_impl { // Maximum number of symbols which could be streamed from the beginning of a // statement till the semicolon - size_t MaxStatementSize_; + unsigned MaxStatementSize_; // Size of the variable which is used as an offset in the stream buffer. // Additinonal memory is allocated in the beginning of the stream buffer for @@ -128,6 +147,9 @@ class stream_impl { // Stream buffer buffer Buf; + + // Buffer for flush buffer size + buffer FlushBufferSize; }; template @@ -268,38 +290,55 @@ EnableIfFP floatingPointToDecStr(T AbsVal, char *Digits, return Offset; } -// Helper method to update offset atomically according to the provided -// operand size of the output operator. Return true if offset is updated and -// false in case of overflow. -inline bool updateOffset(stream_impl::OffsetAccessorType &OffsetAcc, - stream_impl::AccessorType &Acc, unsigned Size, - unsigned &Cur) { +// Helper method to update offset in the global buffer atomically according to +// the provided size of the data in the flush buffer. Return true if offset is +// updated and false in case of overflow. +inline bool updateOffset(stream_impl::GlobalOffsetAccessorT &GlobalOffset, + stream_impl::GlobalBufAccessorT &GlobalBuf, + unsigned Size, unsigned &Cur) { unsigned New; + Cur = GlobalOffset[0].load(); do { - Cur = OffsetAcc[0].load(); - if (Acc.get_count() - Cur < Size) + if (GlobalBuf.get_range().size() - Cur < Size) // Overflow return false; New = Cur + Size; - } while (!OffsetAcc[0].compare_exchange_strong(Cur, New)); + } while (!GlobalOffset[0].compare_exchange_strong(Cur, New)); return true; } -inline void write(stream_impl::OffsetAccessorType &OffsetAcc, - stream_impl::AccessorType &Acc, unsigned Len, const char *Buf, - unsigned Padding = 0) { +inline void flushBuffer(stream_impl::GlobalOffsetAccessorT &GlobalOffset, + stream_impl::GlobalBufAccessorT &GlobalBuf, + stream_impl::FlushBufAccessorT &FlushBufs, + unsigned &WIOffset, unsigned &Offset) { + // Copy data from flush buffer (local memory) to global buffer (global + // memory) unsigned Cur = 0; - if (!updateOffset(OffsetAcc, Acc, Len + Padding, Cur)) + if (!updateOffset(GlobalOffset, GlobalBuf, Offset, Cur)) return; - size_t I = 0; + for (unsigned I = WIOffset; I < WIOffset + Offset; I++) { + GlobalBuf[Cur++] = FlushBufs[I]; + } + // Reset the offset in the flush buffer + Offset = 0; +} + +inline void write(stream_impl::FlushBufAccessorT &FlushBufs, + size_t FlushBufferSize, unsigned WIOffset, unsigned &Offset, + const char *Str, unsigned Len, unsigned Padding = 0) { + if ((FlushBufferSize - Offset < Len + Padding) || + (WIOffset + Offset + Len + Padding > FlushBufs.get_count())) + // TODO: flush here + return; // Write padding - for (; I < Padding; ++I) - Acc[I + Cur] = ' '; + for (size_t I = 0; I < Padding; ++I, ++Offset) + FlushBufs[WIOffset + Offset] = ' '; - for (; I < Len; I++) - Acc[I + Cur] = Buf[I]; + for (size_t I = 0; I < Len; ++I, ++Offset) { + FlushBufs[WIOffset + Offset] = Str[I]; + } } inline void reverseBuf(char *Buf, unsigned Len) { @@ -437,12 +476,12 @@ ScalarToStr(const T &Val, char *Buf, unsigned Flags, int Width, template inline typename std::enable_if::value>::type -writeIntegral(stream_impl::OffsetAccessorType &OffsetAcc, - stream_impl::AccessorType &Acc, unsigned Flags, int Width, +writeIntegral(stream_impl::FlushBufAccessorT &FlushBufs, size_t FlushBufferSize, + unsigned WIOffset, unsigned &Offset, unsigned Flags, int Width, const T &Val) { char Digits[MAX_INTEGRAL_DIGITS] = {0}; unsigned Len = ScalarToStr(Val, Digits, Flags, Width); - write(OffsetAcc, Acc, Len, Digits, + write(FlushBufs, FlushBufferSize, WIOffset, Offset, Digits, Len, (Width > 0 && static_cast(Width) > Len) ? static_cast(Width) - Len : 0); @@ -450,12 +489,12 @@ writeIntegral(stream_impl::OffsetAccessorType &OffsetAcc, template inline EnableIfFP -writeFloatingPoint(stream_impl::OffsetAccessorType &OffsetAcc, - stream_impl::AccessorType &Acc, unsigned Flags, int Width, - int Precision, const T &Val) { +writeFloatingPoint(stream_impl::FlushBufAccessorT &FlushBufs, + size_t FlushBufferSize, unsigned WIOffset, unsigned &Offset, + unsigned Flags, int Width, int Precision, const T &Val) { char Digits[MAX_FLOATING_POINT_DIGITS] = {0}; unsigned Len = ScalarToStr(Val, Digits, Flags, Width, Precision); - write(OffsetAcc, Acc, Len, Digits, + write(FlushBufs, FlushBufferSize, WIOffset, Offset, Digits, Len, (Width > 0 && static_cast(Width) > Len) ? static_cast(Width) - Len : 0); @@ -493,15 +532,16 @@ VecToStr(const vec &Vec, char *VecStr, unsigned Flags, int Width, } template -inline void writeVec(stream_impl::OffsetAccessorType &OffsetAcc, - stream_impl::AccessorType &Acc, unsigned Flags, int Width, - int Precision, const vec &Vec) { +inline void writeVec(stream_impl::FlushBufAccessorT &FlushBufs, + size_t FlushBufferSize, unsigned WIOffset, + unsigned &Offset, unsigned Flags, int Width, int Precision, + const vec &Vec) { // Reserve space for vector elements and delimiters constexpr size_t MAX_VEC_SIZE = MAX_FLOATING_POINT_DIGITS * VecLength + (VecLength - 1) * 2; char VecStr[MAX_VEC_SIZE] = {0}; unsigned Len = VecToStr(Vec, VecStr, Flags, Width, Precision); - write(OffsetAcc, Acc, Len, VecStr, + write(FlushBufs, FlushBufferSize, WIOffset, Offset, VecStr, Len, (Width > 0 && Width > Len) ? Width - Len : 0); } @@ -522,18 +562,18 @@ inline unsigned ArrayToStr(char *Buf, const array &Arr) { } template -inline void writeArray(stream_impl::OffsetAccessorType &OffsetAcc, - stream_impl::AccessorType &Acc, - const array &Arr) { +inline void writeArray(stream_impl::FlushBufAccessorT &FlushBufs, + size_t FlushBufferSize, unsigned WIOffset, + unsigned &Offset, const array &Arr) { char Buf[MAX_ARRAY_SIZE]; unsigned Len = ArrayToStr(Buf, Arr); - write(OffsetAcc, Acc, Len, Buf); + write(FlushBufs, FlushBufferSize, WIOffset, Offset, Buf, Len); } template -inline void writeItem(stream_impl::OffsetAccessorType &OffsetAcc, - stream_impl::AccessorType &Acc, - const item &Item) { +inline void writeItem(stream_impl::FlushBufAccessorT &FlushBufs, + size_t FlushBufferSize, unsigned WIOffset, + unsigned &Offset, const item &Item) { // Reserve space for 3 arrays and additional place (40 symbols) for printing // the text char Buf[3 * MAX_ARRAY_SIZE + 40]; @@ -546,12 +586,13 @@ inline void writeItem(stream_impl::OffsetAccessorType &OffsetAcc, Len += append(Buf + Len, ", offset: "); Len += ArrayToStr(Buf + Len, Item.get_offset()); Buf[Len++] = ')'; - write(OffsetAcc, Acc, Len, Buf); + write(FlushBufs, FlushBufferSize, WIOffset, Offset, Buf, Len); } template -inline void writeNDRange(stream_impl::OffsetAccessorType &OffsetAcc, - stream_impl::AccessorType &Acc, +inline void writeNDRange(stream_impl::FlushBufAccessorT &FlushBufs, + size_t FlushBufferSize, unsigned WIOffset, + unsigned &Offset, const nd_range &ND_Range) { // Reserve space for 3 arrays and additional place (50 symbols) for printing // the text @@ -565,13 +606,13 @@ inline void writeNDRange(stream_impl::OffsetAccessorType &OffsetAcc, Len += append(Buf + Len, ", offset: "); Len += ArrayToStr(Buf + Len, ND_Range.get_offset()); Buf[Len++] = ')'; - write(OffsetAcc, Acc, Len, Buf); + write(FlushBufs, FlushBufferSize, WIOffset, Offset, Buf, Len); } template -inline void writeNDItem(stream_impl::OffsetAccessorType &OffsetAcc, - stream_impl::AccessorType &Acc, - const nd_item &ND_Item) { +inline void writeNDItem(stream_impl::FlushBufAccessorT &FlushBufs, + size_t FlushBufferSize, unsigned WIOffset, + unsigned &Offset, const nd_item &ND_Item) { // Reserve space for 2 arrays and additional place (40 symbols) for printing // the text char Buf[2 * MAX_ARRAY_SIZE + 40]; @@ -582,13 +623,13 @@ inline void writeNDItem(stream_impl::OffsetAccessorType &OffsetAcc, Len += append(Buf + Len, ", local_id: "); Len += ArrayToStr(Buf + Len, ND_Item.get_local_id()); Buf[Len++] = ')'; - write(OffsetAcc, Acc, Len, Buf); + write(FlushBufs, FlushBufferSize, WIOffset, Offset, Buf, Len); } template -inline void writeGroup(stream_impl::OffsetAccessorType &OffsetAcc, - stream_impl::AccessorType &Acc, - const group &Group) { +inline void writeGroup(stream_impl::FlushBufAccessorT &FlushBufs, + size_t FlushBufferSize, unsigned WIOffset, + unsigned &Offset, const group &Group) { // Reserve space for 4 arrays and additional place (60 symbols) for printing // the text char Buf[4 * MAX_ARRAY_SIZE + 60]; @@ -603,7 +644,7 @@ inline void writeGroup(stream_impl::OffsetAccessorType &OffsetAcc, Len += append(Buf + Len, ", group_range: "); Len += ArrayToStr(Buf + Len, Group.get_group_range()); Buf[Len++] = ')'; - write(OffsetAcc, Acc, Len, Buf); + write(FlushBufs, FlushBufferSize, WIOffset, Offset, Buf, Len); } // Space for 2 arrays and additional place (20 symbols) for printing @@ -623,9 +664,9 @@ inline unsigned ItemToStr(char *Buf, const item &Item) { } template -inline void writeHItem(stream_impl::OffsetAccessorType &OffsetAcc, - stream_impl::AccessorType &Acc, - const h_item &HItem) { +inline void writeHItem(stream_impl::FlushBufAccessorT &FlushBufs, + size_t FlushBufferSize, unsigned WIOffset, + unsigned &Offset, const h_item &HItem) { // Reserve space for 3 items and additional place (60 symbols) for printing // the text char Buf[3 * MAX_ITEM_SIZE + 60]; @@ -640,7 +681,7 @@ inline void writeHItem(stream_impl::OffsetAccessorType &OffsetAcc, : HItem.get_physical_local()); } Len += append(Buf + Len, "\n)"); - write(OffsetAcc, Acc, Len, Buf); + write(FlushBufs, FlushBufferSize, WIOffset, Offset, Buf, Len); } } // namespace detail diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 531dea83e4372..3dd2bfea591af 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -286,6 +286,15 @@ class handler { case access::target::local: { detail::LocalAccessorBaseHost *LAcc = static_cast(Ptr); + // Stream implementation creates local accessor with size per work item + // in work group. Number of work items is not available during stream + // construction, that is why size of the accessor is updated here using + // information about number of work items in the work group. + if (detail::getSyclObjImpl(*LAcc)->PerWI) { + auto LocalAccImpl = detail::getSyclObjImpl(*LAcc); + LocalAccImpl->resize(MNDRDesc.LocalSize.size(), + MNDRDesc.GlobalSize.size()); + } range<3> &Size = LAcc->getSize(); const int Dims = LAcc->getNumOfDims(); int SizeInBytes = LAcc->getElementSize(); @@ -526,6 +535,7 @@ class handler { friend class detail::image_accessor; // Make stream class friend to be able to keep the list of associated streams friend class stream; + friend class detail::stream_impl; public: handler(const handler &) = delete; diff --git a/sycl/include/CL/sycl/stream.hpp b/sycl/include/CL/sycl/stream.hpp index 5401b0d55511d..5c78eda50e6d3 100644 --- a/sycl/include/CL/sycl/stream.hpp +++ b/sycl/include/CL/sycl/stream.hpp @@ -22,6 +22,7 @@ enum class stream_manipulator { noshowpos, showpos, endl, + flush, fixed, scientific, hexfloat, @@ -44,6 +45,8 @@ constexpr stream_manipulator showpos = stream_manipulator::showpos; constexpr stream_manipulator endl = stream_manipulator::endl; +constexpr stream_manipulator flush = stream_manipulator::flush; + constexpr stream_manipulator fixed = stream_manipulator::fixed; constexpr stream_manipulator scientific = stream_manipulator::scientific; @@ -104,6 +107,18 @@ class stream { bool operator!=(const stream &LHS) const; + ~stream() { + // Flush data to global buffer in stream destruction if flush buffer is not + // empty. This could be necessary if user hasn't flushed data himself and + // kernel execution is finished + // NOTE: In the current implementation user should explicitly flush data on + // the host device. Data is not flushed automatically after kernel execution + // because of the missing feature in scheduler. + if (Offset) { + flushBuffer(GlobalOffset, GlobalBuf, FlushBufs, WIOffset, Offset); + } + } + private: #ifdef __SYCL_DEVICE_ONLY__ char padding[sizeof(std::shared_ptr)]; @@ -113,15 +128,39 @@ class stream { friend decltype(Obj::impl) detail::getSyclObjImpl(const Obj &SyclObject); #endif - // Accessor to stream buffer - mutable detail::stream_impl::AccessorType Acc; + // Accessor to the global stream buffer. Global buffer contains all output + // from the kernel. + mutable detail::stream_impl::GlobalBufAccessorT GlobalBuf; - // Atomic accessor to the offset variable. It represents an offset in the - // stream buffer. - mutable detail::stream_impl::OffsetAccessorType OffsetAcc; - mutable stream_manipulator Manipulator = defaultfloat; + // Atomic accessor to the global offset variable. It represents an offset in + // the global stream buffer. Since work items will flush data to global buffer + // in parallel we need atomic access to this offset. + mutable detail::stream_impl::GlobalOffsetAccessorT GlobalOffset; + + // Accessor to the pool of flush buffers. Flush buffer contains output from + // work item in the work group. One flush buffer per work item in the work + // group. + mutable detail::stream_impl::FlushBufAccessorT FlushBufs; + + // Each work item in the work group writes to its own flush buffer in the + // pool. This accessor is used to atomically get offset of the flush buffer in + // the pool for each work item in the work group. This approach is used + // because currently it is not possible to get work item id in the work group + // without id object, which is passed to the kernel. + mutable detail::stream_impl::LocalOffsetAccessorT WIOffsetAcc; + + mutable detail::stream_impl::GlobalOffsetAccessorT FlushSize; + + // Offset of the WI's flush buffer in the pool. + mutable unsigned WIOffset = 0; + + // Offset in the flush buffer + mutable unsigned Offset = 0; + + mutable size_t FlushBufferSize; // Fields and methods to work with manipulators + mutable stream_manipulator Manipulator = defaultfloat; // Type used for format flags using FmtFlags = unsigned int; @@ -186,6 +225,19 @@ class stream { } } +#ifdef __SYCL_DEVICE_ONLY__ + void __init() { + // Calculate work item id inside work group, this should be done once, that + // is why this is done in _init method, call to __init method is generated + // by frontend. As a result each work item will write to its own flush + // buffer. + FlushBufferSize = FlushSize[0].load(); + WIOffsetAcc[0].store(0); + detail::workGroupBarrier(); + WIOffset = WIOffsetAcc[0].fetch_add(FlushBufferSize); + } +#endif + friend const stream &operator<<(const stream &, const char); friend const stream &operator<<(const stream &, const char *); template @@ -195,7 +247,9 @@ class stream { friend const stream &operator<<(const stream &, const float &); friend const stream &operator<<(const stream &, const double &); friend const stream &operator<<(const stream &, const half &); + friend const stream &operator<<(const stream &, const stream_manipulator); + friend const stream &operator<<(const stream &Out, const __precision_manipulator__ &RHS); @@ -236,20 +290,22 @@ class stream { // Character inline const stream &operator<<(const stream &Out, const char C) { - unsigned Cur; - if (!detail::updateOffset(Out.OffsetAcc, Out.Acc, 1, Cur)) + if (Out.Offset >= Out.FlushBufferSize || + Out.WIOffset + Out.Offset + 1 > Out.FlushBufs.get_count()) return Out; - Out.Acc[Cur] = C; + Out.FlushBufs[Out.WIOffset + Out.Offset] = C; + ++Out.Offset; return Out; } // String inline const stream &operator<<(const stream &Out, const char *Str) { - unsigned Len; - for (Len = 0; Str[Len] != '\0'; Len++) + unsigned Len = 0; + for (; Str[Len] != '\0'; Len++) ; - detail::write(Out.OffsetAcc, Out.Acc, Len, Str); + detail::write(Out.FlushBufs, Out.FlushBufferSize, Out.WIOffset, Out.Offset, + Str, Len); return Out; } @@ -264,27 +320,30 @@ template inline typename std::enable_if::value, const stream &>::type operator<<(const stream &Out, const ValueType &RHS) { - detail::writeIntegral(Out.OffsetAcc, Out.Acc, Out.get_flags(), - Out.get_width(), RHS); + detail::writeIntegral(Out.FlushBufs, Out.FlushBufferSize, Out.WIOffset, + Out.Offset, Out.get_flags(), Out.get_width(), RHS); return Out; } // Floating points inline const stream &operator<<(const stream &Out, const float &RHS) { - detail::writeFloatingPoint(Out.OffsetAcc, Out.Acc, Out.get_flags(), + detail::writeFloatingPoint(Out.FlushBufs, Out.FlushBufferSize, + Out.WIOffset, Out.Offset, Out.get_flags(), Out.get_width(), Out.get_precision(), RHS); return Out; } inline const stream &operator<<(const stream &Out, const double &RHS) { - detail::writeFloatingPoint(Out.OffsetAcc, Out.Acc, Out.get_flags(), + detail::writeFloatingPoint(Out.FlushBufs, Out.FlushBufferSize, + Out.WIOffset, Out.Offset, Out.get_flags(), Out.get_width(), Out.get_precision(), RHS); return Out; } inline const stream &operator<<(const stream &Out, const half &RHS) { - detail::writeFloatingPoint(Out.OffsetAcc, Out.Acc, Out.get_flags(), + detail::writeFloatingPoint(Out.FlushBufs, Out.FlushBufferSize, + Out.WIOffset, Out.Offset, Out.get_flags(), Out.get_width(), Out.get_precision(), RHS); return Out; } @@ -303,7 +362,8 @@ const stream &operator<<(const stream &Out, const T *RHS) { detail::FmtFlags Flags = Out.get_flags(); Flags &= ~detail::BaseField; Flags |= detail::Hex | detail::ShowBase; - detail::writeIntegral(Out.OffsetAcc, Out.Acc, Flags, Out.get_width(), + detail::writeIntegral(Out.FlushBufs, Out.FlushBufferSize, Out.WIOffset, + Out.Offset, Flags, Out.get_width(), reinterpret_cast(RHS)); return Out; } @@ -312,13 +372,13 @@ const stream &operator<<(const stream &Out, const T *RHS) { inline const stream &operator<<(const stream &Out, const __precision_manipulator__ &RHS) { - Out.Width = RHS.precision(); + Out.Precision = RHS.precision(); return Out; } inline const stream &operator<<(const stream &Out, const __width_manipulator__ &RHS) { - Out.Precision = RHS.width(); + Out.Width = RHS.width(); return Out; } @@ -327,6 +387,12 @@ inline const stream &operator<<(const stream &Out, switch (RHS) { case stream_manipulator::endl: Out << '\n'; + flushBuffer(Out.GlobalOffset, Out.GlobalBuf, Out.FlushBufs, Out.WIOffset, + Out.Offset); + break; + case stream_manipulator::flush: + flushBuffer(Out.GlobalOffset, Out.GlobalBuf, Out.FlushBufs, Out.WIOffset, + Out.Offset); break; default: Out.set_manipulator(RHS); @@ -339,7 +405,8 @@ inline const stream &operator<<(const stream &Out, template const stream &operator<<(const stream &Out, const vec &RHS) { - detail::writeVec(Out.OffsetAcc, Out.Acc, Out.get_flags(), + detail::writeVec(Out.FlushBufs, Out.FlushBufferSize, + Out.WIOffset, Out.Offset, Out.get_flags(), Out.get_width(), Out.get_precision(), RHS); return Out; } @@ -348,49 +415,56 @@ const stream &operator<<(const stream &Out, const vec &RHS) { template inline const stream &operator<<(const stream &Out, const id &RHS) { - detail::writeArray(Out.OffsetAcc, Out.Acc, RHS); + detail::writeArray(Out.FlushBufs, Out.FlushBufferSize, + Out.WIOffset, Out.Offset, RHS); return Out; } template inline const stream &operator<<(const stream &Out, const range &RHS) { - detail::writeArray(Out.OffsetAcc, Out.Acc, RHS); + detail::writeArray(Out.FlushBufs, Out.FlushBufferSize, + Out.WIOffset, Out.Offset, RHS); return Out; } template inline const stream &operator<<(const stream &Out, const item &RHS) { - detail::writeItem(Out.OffsetAcc, Out.Acc, RHS); + detail::writeItem(Out.FlushBufs, Out.FlushBufferSize, + Out.WIOffset, Out.Offset, RHS); return Out; } template inline const stream &operator<<(const stream &Out, const nd_range &RHS) { - detail::writeNDRange(Out.OffsetAcc, Out.Acc, RHS); + detail::writeNDRange(Out.FlushBufs, Out.FlushBufferSize, + Out.WIOffset, Out.Offset, RHS); return Out; } template inline const stream &operator<<(const stream &Out, const nd_item &RHS) { - detail::writeNDItem(Out.OffsetAcc, Out.Acc, RHS); + detail::writeNDItem(Out.FlushBufs, Out.FlushBufferSize, + Out.WIOffset, Out.Offset, RHS); return Out; } template inline const stream &operator<<(const stream &Out, const group &RHS) { - detail::writeGroup(Out.OffsetAcc, Out.Acc, RHS); + detail::writeGroup(Out.FlushBufs, Out.FlushBufferSize, + Out.WIOffset, Out.Offset, RHS); return Out; } template inline const stream &operator<<(const stream &Out, const h_item &RHS) { - detail::writeHItem(Out.OffsetAcc, Out.Acc, RHS); + detail::writeHItem(Out.FlushBufs, Out.FlushBufferSize, + Out.WIOffset, Out.Offset, RHS); return Out; } diff --git a/sycl/source/detail/stream_impl.cpp b/sycl/source/detail/stream_impl.cpp index 1f8136410880d..e66cd11ea6cb1 100644 --- a/sycl/source/detail/stream_impl.cpp +++ b/sycl/source/detail/stream_impl.cpp @@ -23,7 +23,10 @@ stream_impl::stream_impl(size_t BufferSize, size_t MaxStatementSize, // 2. Offset is properly initialized. Data(BufferSize + OffsetSize + 1, 0), Buf(Data.data(), range<1>(BufferSize + OffsetSize + 1), - {property::buffer::use_host_ptr()}) {} + {property::buffer::use_host_ptr()}), + // This buffer is used to pass provided flsuh buffer size to the device + FlushBufferSize(&MaxStatementSize_, range<1>(1), + {property::buffer::use_host_ptr()}) {} size_t stream_impl::get_size() const { return BufferSize_; } diff --git a/sycl/source/stream.cpp b/sycl/source/stream.cpp index 599a03299d4c8..4a67826b58bb7 100644 --- a/sycl/source/stream.cpp +++ b/sycl/source/stream.cpp @@ -14,10 +14,32 @@ namespace sycl { stream::stream(size_t BufferSize, size_t MaxStatementSize, handler &CGH) : impl(std::make_shared(BufferSize, MaxStatementSize, CGH)), - Acc(impl->getAccessor(CGH)), OffsetAcc(impl->getOffsetAccessor(CGH)) { + GlobalBuf(impl->accessGlobalBuf(CGH)), + GlobalOffset(impl->accessGlobalOffset(CGH)), + // Allocate pool of flush buffers, which contains space for each work item + // in the work group + FlushBufs(MaxStatementSize, CGH), + // Offset of the WI's flush buffer in the pool, we need atomic access to + // this offset to differentiate work items so that output from work items + // is not mixed + WIOffsetAcc(range<1>(1), CGH), + FlushSize(impl->accessFlushBufferSize(CGH)), + FlushBufferSize(MaxStatementSize) { + // Save stream implementation in the handler so that stream will be alive // during kernel execution CGH.addStream(impl); + + // Set flag identifying that created local accessor has perWI size. Accessor + // will be resized in SYCL RT when number of work items per work group will be + // available. Local memory size and max work group size is provided to the + // accessor. This info is used to do allocation if work group size is not + // provided by user. + detail::getSyclObjImpl(FlushBufs)->PerWI = true; + detail::getSyclObjImpl(FlushBufs)->LocalMemSize = + CGH.MQueue->get_device().get_info(); + detail::getSyclObjImpl(FlushBufs)->MaxWGSize = + CGH.MQueue->get_device().get_info(); } size_t stream::get_size() const { return impl->get_size(); } diff --git a/sycl/test/basic_tests/stream/auto_flush.cpp b/sycl/test/basic_tests/stream/auto_flush.cpp new file mode 100644 index 0000000000000..4f28c21e52c98 --- /dev/null +++ b/sycl/test/basic_tests/stream/auto_flush.cpp @@ -0,0 +1,34 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// TODO: Enable on host when commands cleanup will be implemented in scheduler +// RUN: %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER +// RUN: %GPU_RUN_ON_LINUX_PLACEHOLDER %t.out %GPU_CHECK_ON_LINUX_PLACEHOLDER +// RUN: %ACC_RUN_PLACEHOLDER %t.out %ACC_CHECK_PLACEHOLDER +// TODO: SYCL specific fail - analyze and enable +//==-------------- copy.cpp - SYCL stream obect auto flushing test ---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cl::sycl; + +int main() { + queue Queue; + + // Test that data is flushed to the buffer at the end of kernel execution even + // without explicit flush + Queue.submit([&](handler &CGH) { + stream Out(1024, 80, CGH); + CGH.parallel_for( + range<1>(2), [=](id<1> i) { Out << "Hello World!\n"; }); + }); + Queue.wait(); + // CHECK: Hello World! + // CHECK-NEXT: Hello World! + + return 0; +} diff --git a/sycl/test/basic_tests/stream.cpp b/sycl/test/basic_tests/stream/stream.cpp similarity index 96% rename from sycl/test/basic_tests/stream.cpp rename to sycl/test/basic_tests/stream/stream.cpp index 174cee682bcb1..a5af301774c1a 100644 --- a/sycl/test/basic_tests/stream.cpp +++ b/sycl/test/basic_tests/stream/stream.cpp @@ -208,7 +208,7 @@ int main() { Queue.submit([&](handler &CGH) { stream Out(1024, 80, CGH); CGH.parallel_for( - range<1>(10), [=](id<1> i) { Out << "Hello, World!\n"; }); + range<1>(10), [=](id<1> i) { Out << "Hello, World!" << endl; }); }); Queue.wait(); // CHECK-NEXT: Hello, World! @@ -237,7 +237,7 @@ int main() { // CHECK-NEXT: nd_item(global_id: {1, 2, 3}, local_id: {1, 0, 1}) Queue.submit([&](handler &CGH) { - stream Out(1024, 80, CGH); + stream Out(1024, 200, CGH); CGH.parallel_for_work_group( range<3>(1, 1, 1), range<3>(1, 1, 1), [=](group<3> Group) { Group.parallel_for_work_item( @@ -253,10 +253,10 @@ int main() { // Multiple streams in command group Queue.submit([&](handler &CGH) { stream Out1(1024, 80, CGH); - stream Out2(500, 10, CGH); + stream Out2(500, 20, CGH); CGH.parallel_for(range<1>(2), [=](id<1> i) { - Out1 << "Hello, World!\n"; - Out2 << "Hello, World!\n"; + Out1 << "Hello, World!" << endl; + Out2 << "Hello, World!" << endl; }); }); Queue.wait(); @@ -270,7 +270,7 @@ int main() { Queue.submit([&](handler &CGH) { stream Out(10, 10, CGH); CGH.parallel_for( - range<1>(2), [=](id<1> i) { Out << "aaaaaaaaa\n"; }); + range<1>(2), [=](id<1> i) { Out << "aaaaaaaaa" << endl; }); }); Queue.wait(); } diff --git a/sycl/test/linear_id/linear-host-dev.cpp b/sycl/test/linear_id/linear-host-dev.cpp index 89437da135569..e321c7df0be63 100644 --- a/sycl/test/linear_id/linear-host-dev.cpp +++ b/sycl/test/linear_id/linear-host-dev.cpp @@ -42,7 +42,7 @@ int main(int argc, char *argv[]) { // CHECK-NEXT: 3 // CHECK-NEXT: 4 // CHECK-NEXT: 5 - out << item.get_linear_id() << "\n"; + out << item.get_linear_id() << cl::sycl::endl; }); });