diff --git a/include/swift/Parse/Lexer.h b/include/swift/Parse/Lexer.h
index 2e96ab77026de..0b388ae4b09c6 100644
--- a/include/swift/Parse/Lexer.h
+++ b/include/swift/Parse/Lexer.h
@@ -399,24 +399,48 @@ class Lexer {
     }
 
   };
-  
+
+  /// Implementation of getEncodedStringSegment. Note that \p Str must support
+  /// reading one byte past the end.
+  static StringRef getEncodedStringSegmentImpl(StringRef Str,
+                                               SmallVectorImpl<char> &Buffer,
+                                               bool IsFirstSegment,
+                                               bool IsLastSegment,
+                                               unsigned IndentToStrip,
+                                               unsigned CustomDelimiterLen);
+
   /// \brief Compute the bytes that the actual string literal should codegen to.
   /// If a copy needs to be made, it will be allocated out of the provided
-  /// Buffer.
-  static StringRef getEncodedStringSegment(StringRef Str,
-                                           SmallVectorImpl<char> &Buffer,
-                                           bool IsFirstSegment = false,
-                                           bool IsLastSegment = false,
-                                           unsigned IndentToStrip = 0,
-                                           unsigned CustomDelimiterLen = 0);
+  /// \p Buffer.
   StringRef getEncodedStringSegment(StringSegment Segment,
                                     SmallVectorImpl<char> &Buffer) const {
-    return getEncodedStringSegment(
+    return getEncodedStringSegmentImpl(
         StringRef(getBufferPtrForSourceLoc(Segment.Loc), Segment.Length),
         Buffer, Segment.IsFirstSegment, Segment.IsLastSegment,
         Segment.IndentToStrip, Segment.CustomDelimiterLen);
   }
 
+  /// \brief Given a string encoded with escapes like a string literal, compute
+  /// the byte content.
+  ///
+  /// If a copy needs to be made, it will be allocated out of the provided
+  /// \p Buffer.
+  static StringRef getEncodedStringSegment(StringRef Str,
+                                           SmallVectorImpl<char> &Buffer) {
+    SmallString<128> TerminatedStrBuf(Str);
+    TerminatedStrBuf.push_back('\0');
+    StringRef TerminatedStr = StringRef(TerminatedStrBuf).drop_back();
+    StringRef Result = getEncodedStringSegmentImpl(TerminatedStr, Buffer,
+                                                   /*IsFirstSegment*/false,
+                                                   /*IsLastSegment*/false,
+                                                   /*IndentToStrip*/0,
+                                                   /*CustomDelimiterLen*/0);
+    if (Result == TerminatedStr)
+      return Str;
+    assert(Result.data() == Buffer.data());
+    return Result;
+  }
+
   /// \brief Given a string literal token, separate it into string/expr segments
   /// of a potentially interpolated string.
   static void getStringLiteralSegments(
diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp
index cbdf56e0dcdef..eb9bc8aed933d 100644
--- a/lib/Parse/Lexer.cpp
+++ b/lib/Parse/Lexer.cpp
@@ -2095,17 +2095,18 @@ void Lexer::tryLexEditorPlaceholder() {
   lexOperatorIdentifier();
 }
 
-StringRef Lexer::getEncodedStringSegment(StringRef Bytes,
-                                         SmallVectorImpl<char> &TempString,
-                                         bool IsFirstSegment,
-                                         bool IsLastSegment,
-                                         unsigned IndentToStrip,
-                                         unsigned CustomDelimiterLen) {
+StringRef Lexer::getEncodedStringSegmentImpl(StringRef Bytes,
+                                             SmallVectorImpl<char> &TempString,
+                                             bool IsFirstSegment,
+                                             bool IsLastSegment,
+                                             unsigned IndentToStrip,
+                                             unsigned CustomDelimiterLen) {
 
   TempString.clear();
-  // Note that it is always safe to read one over the end of "Bytes" because
-  // we know that there is a terminating " character. Use BytesPtr to avoid a
-  // range check subscripting on the StringRef.
+  // Note that it is always safe to read one over the end of "Bytes" because we
+  // know that there is a terminating " character (or null byte for an
+  // unterminated literal or a segment that doesn't come from source). Use
+  // BytesPtr to avoid a range check subscripting on the StringRef.
   const char *BytesPtr = Bytes.begin();
 
   bool IsEscapedNewline = false;
diff --git a/lib/ParseSIL/ParseSIL.cpp b/lib/ParseSIL/ParseSIL.cpp
index 1a16404bdf44a..0ff7ca7c7a995 100644
--- a/lib/ParseSIL/ParseSIL.cpp
+++ b/lib/ParseSIL/ParseSIL.cpp
@@ -2378,18 +2378,22 @@ bool SILParser::parseSILInstruction(SILBuilder &B) {
       return true;
     }
 
-    // Drop the double quotes.
-    StringRef rawString = P.Tok.getText().drop_front().drop_back();
+    // Parse the string.
+    SmallVector<Lexer::StringSegment, 1> segments;
+    P.L->getStringLiteralSegments(P.Tok, segments);
+    assert(segments.size() == 1);
 
     P.consumeToken(tok::string_literal);
     if (parseSILDebugLocation(InstLoc, B))
       return true;
 
-    // Ask the lexer to interpret the entire string as a literal segment.
     SmallVector<char, 128> stringBuffer;
 
     if (encoding == StringLiteralInst::Encoding::Bytes) {
       // Decode hex bytes.
+      CharSourceRange rawStringRange(segments.front().Loc,
+                                     segments.front().Length);
+      StringRef rawString = P.SourceMgr.extractText(rawStringRange);
       if (rawString.size() & 1) {
         P.diagnose(P.Tok, diag::expected_tok_in_sil_instr,
                    "even number of hex bytes");
@@ -2411,7 +2415,8 @@ bool SILParser::parseSILInstruction(SILBuilder &B) {
       break;
     }
 
-    StringRef string = P.L->getEncodedStringSegment(rawString, stringBuffer);
+    StringRef string = P.L->getEncodedStringSegment(segments.front(),
+                                                    stringBuffer);
     ResultVal = B.createStringLiteral(InstLoc, string, encoding);
     break;
   }
diff --git a/unittests/Parse/LexerTests.cpp b/unittests/Parse/LexerTests.cpp
index cc0fa97897d90..d694748f45f4e 100644
--- a/unittests/Parse/LexerTests.cpp
+++ b/unittests/Parse/LexerTests.cpp
@@ -1,12 +1,21 @@
 #include "swift/AST/DiagnosticConsumer.h"
 #include "swift/AST/DiagnosticEngine.h"
+#include "swift/Basic/Defer.h"
 #include "swift/Basic/LangOptions.h"
 #include "swift/Basic/SourceManager.h"
 #include "swift/Parse/Lexer.h"
 #include "swift/Subsystems.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Process.h"
 #include "gtest/gtest.h"
 
+#if __has_include(<sys/mman.h>)
+# include <sys/mman.h>
+# define HAS_MMAP 1
+#else
+# define HAS_MMAP 0
+#endif
+
 using namespace swift;
 using namespace llvm;
 
@@ -806,3 +815,37 @@ TEST_F(LexerTest, DiagnoseEmbeddedNulOffset) {
   ASSERT_FALSE(containsPrefix(
       DiagConsumer.messages, "1, 4: nul character embedded in middle of file"));
 }
+
+#if HAS_MMAP
+
+// This test requires mmap because llvm::sys::Memory doesn't support protecting
+// pages to have no permissions.
+TEST_F(LexerTest, EncodedStringSegmentPastTheEnd) {
+  size_t PageSize = llvm::sys::Process::getPageSize();
+
+  void *FirstPage = mmap(/*addr*/nullptr, PageSize * 2, PROT_NONE,
+                         MAP_PRIVATE | MAP_ANON, /*fd*/-1, /*offset*/0);
+  ASSERT_NE(FirstPage, MAP_FAILED);
+  SWIFT_DEFER { (void)munmap(FirstPage, PageSize * 2); };
+  int ProtectResult = mprotect(FirstPage, PageSize, PROT_READ | PROT_WRITE);
+  ASSERT_EQ(ProtectResult, 0);
+
+  auto check = [FirstPage, PageSize](StringRef Input, StringRef Expected) {
+    char *StartPtr = static_cast<char *>(FirstPage) + PageSize - Input.size();
+    memcpy(StartPtr, Input.data(), Input.size());
+
+    SmallString<64> Buffer;
+    StringRef Escaped = Lexer::getEncodedStringSegment({StartPtr, Input.size()},
+                                                       Buffer);
+    EXPECT_EQ(Escaped, Expected);
+  };
+
+  check("needs escaping\\r",
+        "needs escaping\r");
+  check("does not need escaping",
+        "does not need escaping");
+  check("invalid escape at the end \\",
+        "invalid escape at the end ");
+}
+
+#endif // HAS_MMAP