diff --git a/include/swift/AST/DiagnosticsParse.def b/include/swift/AST/DiagnosticsParse.def index 556fe66810be4..448f1b6572992 100644 --- a/include/swift/AST/DiagnosticsParse.def +++ b/include/swift/AST/DiagnosticsParse.def @@ -138,6 +138,10 @@ ERROR(lex_invalid_u_escape,none, "\\u{...} escape sequence expects between 1 and 8 hex digits", ()) ERROR(lex_invalid_u_escape_rbrace,none, "expected '}' in \\u{...} escape sequence", ()) +ERROR(lex_invalid_escape_delimiter,none, + "too many '#' characters in delimited escape", ()) +ERROR(lex_invalid_closing_delimiter,none, + "too many '#' characters in closing delimiter", ()) ERROR(lex_invalid_unicode_scalar,none, "invalid unicode scalar", ()) diff --git a/include/swift/Parse/Lexer.h b/include/swift/Parse/Lexer.h index 5e880628048cf..2e96ab77026de 100644 --- a/include/swift/Parse/Lexer.h +++ b/include/swift/Parse/Lexer.h @@ -364,12 +364,13 @@ class Lexer { enum : char { Literal, Expr } Kind; // Loc+Length for the segment inside the string literal, without quotes. 
SourceLoc Loc; - unsigned Length, IndentToStrip; + unsigned Length, IndentToStrip, CustomDelimiterLen; bool IsFirstSegment, IsLastSegment; static StringSegment getLiteral(SourceLoc Loc, unsigned Length, bool IsFirstSegment, bool IsLastSegment, - unsigned IndentToStrip) { + unsigned IndentToStrip, + unsigned CustomDelimiterLen) { StringSegment Result; Result.Kind = Literal; Result.Loc = Loc; @@ -377,6 +378,7 @@ class Lexer { Result.IsFirstSegment = IsFirstSegment; Result.IsLastSegment = IsLastSegment; Result.IndentToStrip = IndentToStrip; + Result.CustomDelimiterLen = CustomDelimiterLen; return Result; } @@ -388,6 +390,7 @@ class Lexer { Result.IsFirstSegment = false; Result.IsLastSegment = false; Result.IndentToStrip = 0; + Result.CustomDelimiterLen = 0; return Result; } @@ -404,13 +407,14 @@ class Lexer { SmallVectorImpl &Buffer, bool IsFirstSegment = false, bool IsLastSegment = false, - unsigned IndentToStrip = 0); + unsigned IndentToStrip = 0, + unsigned CustomDelimiterLen = 0); StringRef getEncodedStringSegment(StringSegment Segment, SmallVectorImpl &Buffer) const { return getEncodedStringSegment( StringRef(getBufferPtrForSourceLoc(Segment.Loc), Segment.Length), Buffer, Segment.IsFirstSegment, Segment.IsLastSegment, - Segment.IndentToStrip); + Segment.IndentToStrip, Segment.CustomDelimiterLen); } /// \brief Given a string literal token, separate it into string/expr segments @@ -474,7 +478,8 @@ class Lexer { return diagnose(Loc, Diagnostic(DiagID, std::forward(Args)...)); } - void formToken(tok Kind, const char *TokStart, bool MultilineString = false); + void formToken(tok Kind, const char *TokStart, bool IsMultilineString = false, + unsigned CustomDelimiterLen = 0); void formEscapedIdentifierToken(const char *TokStart); /// Advance to the end of the line. 
@@ -498,10 +503,10 @@ class Lexer { void lexTrivia(syntax::Trivia &T, bool IsForTrailingTrivia); static unsigned lexUnicodeEscape(const char *&CurPtr, Lexer *Diags); - unsigned lexCharacter(const char *&CurPtr, - char StopQuote, bool EmitDiagnostics, - bool MultilineString = false); - void lexStringLiteral(); + unsigned lexCharacter(const char *&CurPtr, char StopQuote, + bool EmitDiagnostics, bool IsMultilineString = false, + unsigned CustomDelimiterLen = 0); + void lexStringLiteral(unsigned CustomDelimiterLen = 0); void lexEscapedIdentifier(); void tryLexEditorPlaceholder(); diff --git a/include/swift/Parse/Token.h b/include/swift/Parse/Token.h index 9d1a99a179751..6cd0fc95828c4 100644 --- a/include/swift/Parse/Token.h +++ b/include/swift/Parse/Token.h @@ -45,7 +45,10 @@ class Token { /// Modifiers for string literals unsigned MultilineString : 1; - // Padding bits == 32 - sizeof(Kind) * 8 - 3; + /// Length of custom delimiter of "raw" string literals + unsigned CustomDelimiterLen : 8; + + // Padding bits == 32 - 11; /// \brief The length of the comment that precedes the token. unsigned CommentLength; @@ -62,8 +65,8 @@ class Token { public: Token(tok Kind, StringRef Text, unsigned CommentLength = 0) : Kind(Kind), AtStartOfLine(false), EscapedIdentifier(false), - MultilineString(false), CommentLength(CommentLength), - Text(Text) {} + MultilineString(false), CustomDelimiterLen(0), + CommentLength(CommentLength), Text(Text) {} Token() : Token(tok::NUM_TOKENS, {}, 0) {} @@ -266,17 +269,24 @@ class Token { /// \brief Set the token to the specified kind and source range. 
void setToken(tok K, StringRef T, unsigned CommentLength = 0, - bool MultilineString = false) { + bool IsMultilineString = false, unsigned CustomDelimiterLen = 0) { Kind = K; Text = T; this->CommentLength = CommentLength; EscapedIdentifier = false; - this->MultilineString = MultilineString; + this->MultilineString = IsMultilineString; + this->CustomDelimiterLen = CustomDelimiterLen; + assert(this->CustomDelimiterLen == CustomDelimiterLen && + "custom string delimiter length > 255"); } - bool IsMultilineString() const { + bool isMultilineString() const { return MultilineString; } + + unsigned getCustomDelimiterLen() const { + return CustomDelimiterLen; + } }; } // end namespace swift diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index a4ec9225b1c4d..d77fda48834d9 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -272,7 +272,8 @@ Token Lexer::getTokenAt(SourceLoc Loc) { return Result; } -void Lexer::formToken(tok Kind, const char *TokStart, bool MultilineString) { +void Lexer::formToken(tok Kind, const char *TokStart, + bool IsMultilineString, unsigned CustomDelimiterLen) { assert(CurPtr >= BufferStart && CurPtr <= BufferEnd && "Current pointer out of range!"); @@ -304,7 +305,8 @@ void Lexer::formToken(tok Kind, const char *TokStart, bool MultilineString) { lexTrivia(TrailingTrivia, /* IsForTrailingTrivia */ true); } - NextToken.setToken(Kind, TokenText, CommentLength, MultilineString); + NextToken.setToken(Kind, TokenText, CommentLength, + IsMultilineString, CustomDelimiterLen); } void Lexer::formEscapedIdentifierToken(const char *TokStart) { @@ -1211,6 +1213,69 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) { } } +/// diagnoseZeroWidthMatchAndAdvance - Error invisible characters in delimiters. +/// An invisible character in the middle of a delimiter can be used to extend +/// the literal beyond what it would appear creating potential security bugs. 
+static bool diagnoseZeroWidthMatchAndAdvance(char Target, const char *&CurPtr, + DiagnosticEngine *Diags) { + // TODO: Detect, diagnose and skip over zero-width characters if required. + // See https://github.com/apple/swift/pull/17668 for possible implementation. + return *CurPtr == Target && CurPtr++; // On a match, the post-increment yields the old (non-null) pointer, i.e. true. +} + +/// advanceIfMultilineDelimiter - If CurPtr[-1] and the next two characters are all '"', skips those two and returns true; else CurPtr is unchanged. +static bool advanceIfMultilineDelimiter(const char *&CurPtr, + DiagnosticEngine *Diags) { + const char *TmpPtr = CurPtr; + if (*(TmpPtr - 1) == '"' && + diagnoseZeroWidthMatchAndAdvance('"', TmpPtr, Diags) && + diagnoseZeroWidthMatchAndAdvance('"', TmpPtr, Diags)) { + CurPtr = TmpPtr; + return true; + } + return false; +} + +/// advanceIfCustomDelimiter - Detects a custom delimiter opening a string literal. +/// Counts CurPtr[-1] (generally '#' when called; not checked here) plus any following '#' characters. +/// If a '"' comes next, advances CurPtr past it and returns that count; otherwise returns 0 and leaves CurPtr unchanged. +static unsigned advanceIfCustomDelimiter(const char *&CurPtr, + DiagnosticEngine *Diags) { + const char *TmpPtr = CurPtr; + unsigned CustomDelimiterLen = 1; + while (diagnoseZeroWidthMatchAndAdvance('#', TmpPtr, Diags)) + CustomDelimiterLen++; + if (diagnoseZeroWidthMatchAndAdvance('"', TmpPtr, Diags)) { + CurPtr = TmpPtr; + return CustomDelimiterLen; + } + return 0; +} + +/// delimiterMatches - Does custom delimiter ('#' characters surrounding quotes) +/// match the number of '#' characters after '\' inside the string? This allows +/// interpolation inside a "raw" string. Normal/cooked string processing is +/// the degenerate case of there being no '#' characters surrounding the quotes. +/// If delimiter matches, advances byte pointer passed in and returns true. +/// Also used to detect the final delimiter of a string when IsClosing == true. 
+static bool delimiterMatches(unsigned CustomDelimiterLen, const char *&BytesPtr, + DiagnosticEngine *Diags, bool IsClosing = false) { + if (!CustomDelimiterLen) + return true; + const char *TmpPtr = BytesPtr; + while (CustomDelimiterLen--) + if (!diagnoseZeroWidthMatchAndAdvance('#', TmpPtr, Diags)) + return false; + BytesPtr = TmpPtr; + if (*BytesPtr == '#' && Diags) + Diags->diagnose(Lexer::getSourceLoc(BytesPtr), IsClosing ? + diag::lex_invalid_closing_delimiter : + diag::lex_invalid_escape_delimiter) + .fixItRemoveChars(Lexer::getSourceLoc(BytesPtr), + Lexer::getSourceLoc(BytesPtr + 1)); + return true; +} + /// lexCharacter - Read a character and return its UTF32 code. If this is the /// end of enclosing string/character sequence (i.e. the character is equal to /// 'StopQuote'), this returns ~0U and leaves 'CurPtr' pointing to the terminal @@ -1220,7 +1285,8 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) { /// character_escape ::= [\][\] | [\]t | [\]n | [\]r | [\]" | [\]' | [\]0 /// character_escape ::= unicode_character_escape unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, - bool EmitDiagnostics, bool MultilineString) { + bool EmitDiagnostics, bool IsMultilineString, + unsigned CustomDelimiterLen) { const char *CharStart = CurPtr; switch (*CurPtr++) { @@ -1228,7 +1294,7 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, // If this is a "high" UTF-8 character, validate it. if ((signed char)(CurPtr[-1]) >= 0) { if (isPrintable(CurPtr[-1]) == 0) - if (!(MultilineString && (CurPtr[-1] == '\t'))) + if (!(IsMultilineString && (CurPtr[-1] == '\t'))) if (EmitDiagnostics) diagnose(CharStart, diag::lex_unprintable_ascii_character); return CurPtr[-1]; @@ -1263,12 +1329,15 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, return ~1U; case '\n': // String literals cannot have \n or \r in them. case '\r': - if (MultilineString) // ... 
unless they are multiline + if (IsMultilineString) // ... unless they are multiline return CurPtr[-1]; if (EmitDiagnostics) diagnose(CurPtr-1, diag::lex_unterminated_string); return ~1U; case '\\': // Escapes. + if (!delimiterMatches(CustomDelimiterLen, CurPtr, + EmitDiagnostics ? Diags : nullptr)) + return '\\'; break; } @@ -1276,7 +1345,7 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, // Escape processing. We already ate the "\". switch (*CurPtr) { case ' ': case '\t': case '\n': case '\r': - if (MultilineString && maybeConsumeNewlineEscape(CurPtr, 0)) + if (IsMultilineString && maybeConsumeNewlineEscape(CurPtr, 0)) return '\n'; LLVM_FALLTHROUGH; default: // Invalid escape. @@ -1334,10 +1403,11 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, const char *EndPtr, DiagnosticEngine *Diags, - bool MultilineString) { - llvm::SmallVector OpenDelimiters; - llvm::SmallVector AllowNewline; - AllowNewline.push_back(MultilineString); + bool IsMultilineString) { + SmallVector OpenDelimiters; + SmallVector AllowNewline; + SmallVector CustomDelimiter; + AllowNewline.push_back(IsMultilineString); auto inStringLiteral = [&]() { return !OpenDelimiters.empty() && @@ -1352,6 +1422,7 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, // On success scanning the expression body, the real lexer will be used to // relex the body when parsing the expressions. We let it diagnose any // issues with malformed tokens or other problems. + unsigned CustomDelimiterLen = 0; switch (*CurPtr++) { // String literals in general cannot be split across multiple lines; // interpolated ones are no exception - unless multiline literals. @@ -1362,43 +1433,52 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, // Will be diagnosed as an unterminated string literal. 
return CurPtr-1; + case '#': + if (inStringLiteral() || + !(CustomDelimiterLen = advanceIfCustomDelimiter(CurPtr, Diags))) + continue; + LLVM_FALLTHROUGH; + case '"': case '\'': { if (!AllowNewline.back() && inStringLiteral()) { - if (OpenDelimiters.back() == CurPtr[-1]) { + if (OpenDelimiters.back() == CurPtr[-1] && + delimiterMatches(CustomDelimiter.back(), CurPtr, Diags, true)) { // Closing single line string literal. OpenDelimiters.pop_back(); AllowNewline.pop_back(); + CustomDelimiter.pop_back(); } // Otherwise, it's just a quote in string literal. e.g. "foo's". continue; } - bool isMultilineQuote = ( - *CurPtr == '"' && *(CurPtr + 1) == '"' && *(CurPtr - 1) == '"'); - if (isMultilineQuote) - CurPtr += 2; + bool isMultilineQuote = advanceIfMultilineDelimiter(CurPtr, Diags); if (!inStringLiteral()) { // Open string literal OpenDelimiters.push_back(CurPtr[-1]); AllowNewline.push_back(isMultilineQuote); + CustomDelimiter.push_back(CustomDelimiterLen); continue; } // We are in multiline string literal. assert(AllowNewline.back() && "other cases must be handled above"); - if (isMultilineQuote) { + if (isMultilineQuote && + delimiterMatches(CustomDelimiter.back(), CurPtr, Diags, true)) { // Close multiline string literal. OpenDelimiters.pop_back(); AllowNewline.pop_back(); + CustomDelimiter.pop_back(); } // Otherwise, it's just a normal character in multiline string. 
continue; } case '\\': - if (inStringLiteral()) { + if (inStringLiteral() && + delimiterMatches(CustomDelimiter.back(), CurPtr, Diags)) { char escapedChar = *CurPtr++; switch (escapedChar) { case '(': @@ -1458,7 +1538,10 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, static StringRef getStringLiteralContent(const Token &Str) { StringRef Bytes = Str.getText(); - if (Str.IsMultilineString()) + if (unsigned CustomDelimiterLen = Str.getCustomDelimiterLen()) + Bytes = Bytes.drop_front(CustomDelimiterLen).drop_back(CustomDelimiterLen); + + if (Str.isMultilineString()) Bytes = Bytes.drop_front(3).drop_back(3); else Bytes = Bytes.drop_front().drop_back(); @@ -1496,7 +1579,7 @@ getMultilineTrailingIndent(const Token &Str, DiagnosticEngine *Diags) { auto string = StringRef(start, end - start); // Disallow escaped newline in the last line. - if (Diags) { + if (Diags && Str.getCustomDelimiterLen() == 0) { auto *Ptr = start - 1; if (*Ptr == '\n') --Ptr; if (*Ptr == '\r') --Ptr; @@ -1652,30 +1735,31 @@ static void validateMultilineIndents(const Token &Str, /// lexStringLiteral: /// string_literal ::= ["]([^"\\\n\r]|character_escape)*["] /// string_literal ::= ["]["]["].*["]["]["] - approximately -void Lexer::lexStringLiteral() { +/// string_literal ::= (#+)("")?".*"(\2\1) - "raw" strings +void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) { const char *TokStart = CurPtr-1; assert((*TokStart == '"' || *TokStart == '\'') && "Unexpected start"); // NOTE: We only allow single-quote string literals so we can emit useful // diagnostics about changing them to double quotes. - bool wasErroneous = false, MultilineString = false; + bool wasErroneous = false, IsMultilineString = false; // Is this the start of a multiline string literal? 
- if (*TokStart == '"' && *CurPtr == '"' && *(CurPtr + 1) == '"') { - MultilineString = true; - CurPtr += 2; + if ((IsMultilineString = advanceIfMultilineDelimiter(CurPtr, Diags))) { if (*CurPtr != '\n' && *CurPtr != '\r') diagnose(CurPtr, diag::lex_illegal_multiline_string_start) .fixItInsert(Lexer::getSourceLoc(CurPtr), "\n"); } while (true) { - if (*CurPtr == '\\' && *(CurPtr + 1) == '(') { + const char *TmpPtr = CurPtr + 1; + if (*CurPtr == '\\' && delimiterMatches(CustomDelimiterLen, TmpPtr, nullptr) + && *TmpPtr == '(') { // Consume tokens until we hit the corresponding ')'. - CurPtr += 2; + CurPtr = TmpPtr + 1; const char *EndPtr = skipToEndOfInterpolatedExpression(CurPtr, BufferEnd, - Diags, MultilineString); + Diags, IsMultilineString); if (*EndPtr == ')') { // Successfully scanned the body of the expression literal. @@ -1688,21 +1772,21 @@ void Lexer::lexStringLiteral() { } // String literals cannot have \n or \r in them (unless multiline). - if (((*CurPtr == '\r' || *CurPtr == '\n') && !MultilineString) + if (((*CurPtr == '\r' || *CurPtr == '\n') && !IsMultilineString) || CurPtr == BufferEnd) { + TokStart -= CustomDelimiterLen; diagnose(TokStart, diag::lex_unterminated_string); return formToken(tok::unknown, TokStart); } - unsigned CharValue = lexCharacter(CurPtr, *TokStart, true, MultilineString); + unsigned CharValue = lexCharacter(CurPtr, *TokStart, true, + IsMultilineString, CustomDelimiterLen); wasErroneous |= CharValue == ~1U; // If this is the end of string, we are done. If it is a normal character // or an already-diagnosed error, just munch it. if (CharValue == ~0U) { ++CurPtr; - if (wasErroneous) - return formToken(tok::unknown, TokStart); if (*TokStart == '\'') { // Complain about single-quote string and suggest replacement with @@ -1738,20 +1822,19 @@ void Lexer::lexStringLiteral() { replacement); } - // Is this the end of a multiline string literal? 
- if (MultilineString) { - if (*CurPtr == '"' && *(CurPtr + 1) == '"' && *(CurPtr + 2) != '"') { - CurPtr += 2; - formToken(tok::string_literal, TokStart, MultilineString); - if (Diags) - validateMultilineIndents(NextToken, Diags); - return; - } - else - continue; + // Is this the end of multiline/custom-delimited string literal? + if ((!IsMultilineString || advanceIfMultilineDelimiter(CurPtr, Diags)) && + delimiterMatches(CustomDelimiterLen, CurPtr, Diags, true)) { + TokStart -= CustomDelimiterLen; + if (wasErroneous) + return formToken(tok::unknown, TokStart); + + formToken(tok::string_literal, TokStart, + IsMultilineString, CustomDelimiterLen); + if (IsMultilineString && Diags) + validateMultilineIndents(NextToken, Diags); + return; } - - return formToken(tok::string_literal, TokStart, MultilineString); } } } @@ -2016,13 +2099,35 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes, SmallVectorImpl &TempString, bool IsFirstSegment, bool IsLastSegment, - unsigned IndentToStrip) { + unsigned IndentToStrip, + unsigned CustomDelimiterLen) { TempString.clear(); // Note that it is always safe to read one over the end of "Bytes" because // we know that there is a terminating " character. Use BytesPtr to avoid a // range check subscripting on the StringRef. const char *BytesPtr = Bytes.begin(); + + // Special case when being called from EncodedDiagnosticMessage(...). + // This allows multiline and delimited strings to work in attributes. + // The string has already been validated by the initial parse. + if (IndentToStrip == ~0u && CustomDelimiterLen == ~0u) { + IndentToStrip = CustomDelimiterLen = 0; + + // Restore trailing indent removal for multiline. + const char *Backtrack = BytesPtr - 1; + if (Backtrack[-1] == '"' && Backtrack[-2] == '"') { + Backtrack -= 2; + for (const char *Trailing = Bytes.end() - 1; + *Trailing == ' ' || *Trailing == '\t'; Trailing--) + IndentToStrip++; + } + + // Restore delimiter if any. 
+ while (*--Backtrack == '#') + CustomDelimiterLen++; + } + bool IsEscapedNewline = false; while (BytesPtr < Bytes.end()) { char CurChar = *BytesPtr++; @@ -2043,7 +2148,8 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes, continue; } - if (CurChar != '\\') { + if (CurChar != '\\' || + !delimiterMatches(CustomDelimiterLen, BytesPtr, nullptr)) { TempString.push_back(CurChar); continue; } @@ -2113,8 +2219,8 @@ void Lexer::getStringLiteralSegments( // Are substitutions required either for indent stripping or line ending // normalization? - bool MultilineString = Str.IsMultilineString(), IsFirstSegment = true; - unsigned IndentToStrip = 0; + bool MultilineString = Str.isMultilineString(), IsFirstSegment = true; + unsigned IndentToStrip = 0, CustomDelimiterLen = Str.getCustomDelimiterLen(); if (MultilineString) IndentToStrip = std::get<0>(getMultilineTrailingIndent(Str, /*Diags=*/nullptr)).size(); @@ -2124,13 +2230,12 @@ void Lexer::getStringLiteralSegments( // range check subscripting on the StringRef. const char *SegmentStartPtr = Bytes.begin(); const char *BytesPtr = SegmentStartPtr; - // FIXME: Use SSE to scan for '\'. - while (BytesPtr != Bytes.end()) { - char CurChar = *BytesPtr++; - if (CurChar != '\\') - continue; + size_t pos; + while ((pos = Bytes.find('\\', BytesPtr-Bytes.begin())) != StringRef::npos) { + BytesPtr = Bytes.begin() + pos + 1; - if (*BytesPtr++ != '(') + if (!delimiterMatches(CustomDelimiterLen, BytesPtr, Diags) || + *BytesPtr++ != '(') continue; // String interpolation. @@ -2138,8 +2243,9 @@ void Lexer::getStringLiteralSegments( // Push the current segment. Segments.push_back( StringSegment::getLiteral(getSourceLoc(SegmentStartPtr), - BytesPtr-SegmentStartPtr-2, - IsFirstSegment, false, IndentToStrip)); + BytesPtr-SegmentStartPtr-2-CustomDelimiterLen, + IsFirstSegment, false, IndentToStrip, + CustomDelimiterLen)); IsFirstSegment = false; // Find the closing ')'. 
@@ -2162,7 +2268,8 @@ void Lexer::getStringLiteralSegments( Segments.push_back( StringSegment::getLiteral(getSourceLoc(SegmentStartPtr), Bytes.end()-SegmentStartPtr, - IsFirstSegment, true, IndentToStrip)); + IsFirstSegment, true, IndentToStrip, + CustomDelimiterLen)); } @@ -2261,6 +2368,8 @@ void Lexer::lexImpl() { case '\\': return formToken(tok::backslash, TokStart); case '#': + if (unsigned CustomDelimiterLen = advanceIfCustomDelimiter(CurPtr, Diags)) + return lexStringLiteral(CustomDelimiterLen); return lexHash(); // Operator characters. diff --git a/lib/Parse/ParseExpr.cpp b/lib/Parse/ParseExpr.cpp index a8501d3433155..c5bbd6a8b9269 100644 --- a/lib/Parse/ParseExpr.cpp +++ b/lib/Parse/ParseExpr.cpp @@ -1923,7 +1923,7 @@ ParserResult Parser::parseExprStringLiteral() { LocalContext.setCreateSyntax(SyntaxKind::StringInterpolationExpr); StringRef Quote; tok QuoteKind; - std::tie(Quote, QuoteKind) = Tok.IsMultilineString() ? + std::tie(Quote, QuoteKind) = Tok.isMultilineString() ? std::make_tuple("\"\"\"", tok::multiline_string_quote) : std::make_tuple("\"", tok::string_quote); diff --git a/lib/Parse/Parser.cpp b/lib/Parse/Parser.cpp index 8051798a9bfaa..41d7dceb171e1 100644 --- a/lib/Parse/Parser.cpp +++ b/lib/Parse/Parser.cpp @@ -216,8 +216,9 @@ static void getStringPartTokens(const Token &Tok, const LangOptions &LangOpts, const SourceManager &SM, int BufID, std::vector &Toks) { assert(Tok.is(tok::string_literal)); - bool IsMultiline = Tok.IsMultilineString(); - unsigned QuoteLen = IsMultiline ? 3 : 1; + bool IsMultiline = Tok.isMultilineString(); + unsigned CustomDelimiterLen = Tok.getCustomDelimiterLen(); + unsigned QuoteLen = (IsMultiline ? 
3 : 1) + CustomDelimiterLen; SmallVector Segments; Lexer::getStringLiteralSegments(Tok, Segments, /*Diags=*/nullptr); for (unsigned i = 0, e = Segments.size(); i != e; ++i) { @@ -239,7 +240,8 @@ static void getStringPartTokens(const Token &Tok, const LangOptions &LangOpts, StringRef Text = SM.extractText({ Loc, Len }); Token NewTok; - NewTok.setToken(tok::string_literal, Text, IsMultiline); + NewTok.setToken(tok::string_literal, Text, /*CommentLength=*/0, + IsMultiline, CustomDelimiterLen); Toks.push_back(NewTok); } else { @@ -372,7 +374,7 @@ class TokenRecorder: public ConsumeTokenReceiver { } void relexComment(CharSourceRange CommentRange, - llvm::SmallVectorImpl &Scracth) { + llvm::SmallVectorImpl &Scratch) { Lexer L(Ctx.LangOpts, Ctx.SourceMgr, BufferID, nullptr, /*InSILMode=*/false, HashbangMode::Disallowed, CommentRetentionMode::ReturnAsTokens, @@ -385,7 +387,7 @@ class TokenRecorder: public ConsumeTokenReceiver { if (Result.is(tok::eof)) break; assert(Result.is(tok::comment)); - Scracth.push_back(Result); + Scratch.push_back(Result); } } diff --git a/lib/Sema/TypeChecker.h b/lib/Sema/TypeChecker.h index 5e55ef3344b55..7d64ecad386f9 100644 --- a/lib/Sema/TypeChecker.h +++ b/lib/Sema/TypeChecker.h @@ -2164,7 +2164,7 @@ class EncodedDiagnosticMessage { public: /// \param S A string with an encoded message EncodedDiagnosticMessage(StringRef S) - : Message(Lexer::getEncodedStringSegment(S, Buf)) {} + : Message(Lexer::getEncodedStringSegment(S, Buf, true, true, ~0, ~0)) {} /// The unescaped message to display to the user. 
const StringRef Message; diff --git a/test/Parse/raw_string.swift b/test/Parse/raw_string.swift new file mode 100644 index 0000000000000..953b92324b041 --- /dev/null +++ b/test/Parse/raw_string.swift @@ -0,0 +1,134 @@ +// RUN: %target-swift-frontend -dump-ast %s 2>&1 | %FileCheck --strict-whitespace %s + +import Swift + +_ = #""" +################################################################### +## This source file is part of the Swift.org open source project ## +################################################################### +"""# +// CHECK: "###################################################################\n## This source file is part of the Swift.org open source project ##\n###################################################################" + +_ = #""" + # H1 # + ## H2 ## + ### H3 ### + """# +// CHECK: "# H1 #\n## H2 ##\n### H3 ###" + +// ===---------- Multiline RawString --------=== + +_ = ##""" + One + ""Alpha"" + """## +// CHECK: "One\n\"\"Alpha\"\"" + +_ = ##""" + Two + Beta + """## +// CHECK: " Two\nBeta" + +_ = #""" + Three\r + Gamma\ + """# +// CHECK: " Three\\r\n Gamma\\" + +_ = ###""" + Four \(foo) + Delta +"""### +// CHECK: " Four \\(foo)\n Delta" + +_ = ##""" + print(""" + Five\##n\##n\##nEpsilon + """) + """## +// CHECK: "print(\"\"\"\n Five\n\n\nEpsilon\n \"\"\")" + +// ===---------- Single line --------=== + +_ = #""Zeta""# +// CHECK: "\"Zeta\"" + +_ = #""Eta"\#n\#n\#n\#""# +// CHECK: "\"Eta\"\n\n\n\"" + +_ = #""Iota"\n\n\n\""# +// CHECK: "\"Iota\"\\n\\n\\n\\\"" + +_ = #"a raw string with \" in it"# +// CHECK: "a raw string with \\\" in it" + +_ = ##""" + a raw string with """ in it + """## +// CHECK: "a raw string with \"\"\" in it" + +let foo = "Interpolation" +_ = #"\b\b \#(foo)\#(foo) Kappa"# +// CHECK: "\\b\\b " +// CHECK: " Kappa" + +_ = """ + interpolating \(##""" + delimited \##("string")\#n\##n + """##) + """ + +// CHECK: "interpolating " +// CHECK: "delimited " +// CHECK: "string" +// CHECK: "\\#n\n" + +#"unused literal"# +// 
CHECK: "unused literal" + +// ===---------- From proposal --------=== + +_ = #"This is a string"# +// CHECK: "This is a string" + +_ = #####"This is a string"##### +// CHECK: "This is a string" + +_ = #"enum\s+.+\{.*case\s+[:upper:]"# +// CHECK: "enum\\s+.+\\{.*case\\s+[:upper:]" + +_ = #"Alice: "How long is forever?" White Rabbit: "Sometimes, just one second.""# +// CHECK: "Alice: \"How long is forever?\" White Rabbit: \"Sometimes, just one second.\"" + +_ = #"\#\#1"# +// CHECK: "\\#1" + +_ = ##"\#1"## +// CHECK: "\\#1" + +_ = #"c:\windows\system32"# +// CHECK: "c:\\windows\\system32" + +_ = #"\d{3) \d{3} \d{4}"# +// CHECK: "\\d{3) \\d{3} \\d{4}" + +_ = #""" + a string with + """ + in it + """# +// CHECK: "a string with\n\"\"\"\nin it" + +_ = #"a raw string containing \r\n"# +// CHECK: "a raw string containing \\r\\n" + +_ = #""" + [ + { + "id": "12345", + "title": "A title that \"contains\" \\\"" + } + ] + """# +// CHECK: "[\n {\n \"id\": \"12345\",\n \"title\": \"A title that \\\"contains\\\" \\\\\\\"\"\n }\n]" diff --git a/test/Parse/raw_string_errors.swift b/test/Parse/raw_string_errors.swift new file mode 100644 index 0000000000000..ca19b41fe4dd2 --- /dev/null +++ b/test/Parse/raw_string_errors.swift @@ -0,0 +1,14 @@ +// RUN: %target-typecheck-verify-swift + +#"\##("invalid")"# +// expected-error@-1{{too many '#' characters in delimited escape}} +// expected-error@-2{{invalid escape sequence in literal}} + +####"invalid"### +// expected-error@-1{{unterminated string literal}} + +###"invalid"#### +// expected-error@-1{{too many '#' characters in closing delimiter}} +// expected-error@-2{{consecutive statements on a line must be separated by ';'}} +// expected-error@-3{{expected expression}} +// expected-warning@-4{{string literal is unused}}