diff --git a/include/swift/AST/DiagnosticsParse.def b/include/swift/AST/DiagnosticsParse.def index 17c21892eb7ff..dd1980e92b82f 100644 --- a/include/swift/AST/DiagnosticsParse.def +++ b/include/swift/AST/DiagnosticsParse.def @@ -138,6 +138,10 @@ ERROR(lex_invalid_u_escape,none, "\\u{...} escape sequence expects between 1 and 8 hex digits", ()) ERROR(lex_invalid_u_escape_rbrace,none, "expected '}' in \\u{...} escape sequence", ()) +ERROR(lex_invalid_escape_delimiter,none, + "too many '#' characters in delimited escape", ()) +ERROR(lex_invalid_closing_delimiter,none, + "too many '#' characters in closing delimiter", ()) ERROR(lex_invalid_unicode_scalar,none, "invalid unicode scalar", ()) @@ -1302,7 +1306,9 @@ ERROR(swift_native_objc_runtime_base_must_be_identifier,none, "@_swift_native_objc_runtime_base class name must be an identifier", ()) ERROR(attr_interpolated_string,none, -"%0 cannot be an interpolated string literal", (StringRef)) +"'%0' cannot be an interpolated string literal", (StringRef)) +ERROR(attr_extended_escaping_string,none, +"'%0' cannot be an extended escaping string literal", (StringRef)) ERROR(attr_only_at_non_local_scope, none, "attribute '%0' can only be used in a non-local scope", (StringRef)) diff --git a/include/swift/Parse/Lexer.h b/include/swift/Parse/Lexer.h index 876170b929c26..3fbb2fe885f52 100644 --- a/include/swift/Parse/Lexer.h +++ b/include/swift/Parse/Lexer.h @@ -346,12 +346,13 @@ class Lexer { enum : char { Literal, Expr } Kind; // Loc+Length for the segment inside the string literal, without quotes. 
SourceLoc Loc; - unsigned Length, IndentToStrip; + unsigned Length, IndentToStrip, CustomDelimiterLen; bool IsFirstSegment, IsLastSegment; static StringSegment getLiteral(SourceLoc Loc, unsigned Length, bool IsFirstSegment, bool IsLastSegment, - unsigned IndentToStrip) { + unsigned IndentToStrip, + unsigned CustomDelimiterLen) { StringSegment Result; Result.Kind = Literal; Result.Loc = Loc; @@ -359,6 +360,7 @@ class Lexer { Result.IsFirstSegment = IsFirstSegment; Result.IsLastSegment = IsLastSegment; Result.IndentToStrip = IndentToStrip; + Result.CustomDelimiterLen = CustomDelimiterLen; return Result; } @@ -370,6 +372,7 @@ class Lexer { Result.IsFirstSegment = false; Result.IsLastSegment = false; Result.IndentToStrip = 0; + Result.CustomDelimiterLen = 0; return Result; } @@ -378,21 +381,50 @@ class Lexer { } }; - + + /// Implementation of getEncodedStringSegment. Note that \p Str must support + /// reading one byte past the end. + static StringRef getEncodedStringSegmentImpl(StringRef Str, + SmallVectorImpl &Buffer, + bool IsFirstSegment, + bool IsLastSegment, + unsigned IndentToStrip, + unsigned CustomDelimiterLen); + /// \brief Compute the bytes that the actual string literal should codegen to. /// If a copy needs to be made, it will be allocated out of the provided - /// Buffer. - static StringRef getEncodedStringSegment(StringRef Str, - SmallVectorImpl &Buffer, - bool IsFirstSegment = false, - bool IsLastSegment = false, - unsigned IndentToStrip = 0); + /// \p Buffer. StringRef getEncodedStringSegment(StringSegment Segment, SmallVectorImpl &Buffer) const { - return getEncodedStringSegment( + return getEncodedStringSegmentImpl( StringRef(getBufferPtrForSourceLoc(Segment.Loc), Segment.Length), Buffer, Segment.IsFirstSegment, Segment.IsLastSegment, - Segment.IndentToStrip); + Segment.IndentToStrip, Segment.CustomDelimiterLen); + } + + /// \brief Given a string encoded with escapes like a string literal, compute + /// the byte content. 
+ /// + /// If a copy needs to be made, it will be allocated out of the provided + /// \p Buffer. + static StringRef getEncodedStringSegment(StringRef Str, + SmallVectorImpl &Buffer, + bool IsFirstSegment = false, + bool IsLastSegment = false, + unsigned IndentToStrip = 0, + unsigned CustomDelimiterLen = 0) { + SmallString<128> TerminatedStrBuf(Str); + TerminatedStrBuf.push_back('\0'); + StringRef TerminatedStr = StringRef(TerminatedStrBuf).drop_back(); + StringRef Result = getEncodedStringSegmentImpl(TerminatedStr, Buffer, + IsFirstSegment, + IsLastSegment, + IndentToStrip, + CustomDelimiterLen); + if (Result == TerminatedStr) + return Str; + assert(Result.data() == Buffer.data()); + return Result; } /// \brief Given a string literal token, separate it into string/expr segments @@ -456,7 +488,8 @@ class Lexer { return diagnose(Loc, Diagnostic(DiagID, std::forward(Args)...)); } - void formToken(tok Kind, const char *TokStart, bool MultilineString = false); + void formToken(tok Kind, const char *TokStart, bool IsMultilineString = false, + unsigned CustomDelimiterLen = 0); void formEscapedIdentifierToken(const char *TokStart); /// Advance to the end of the line. 
@@ -480,10 +513,10 @@ class Lexer { void lexTrivia(syntax::Trivia &T, bool IsForTrailingTrivia); static unsigned lexUnicodeEscape(const char *&CurPtr, Lexer *Diags); - unsigned lexCharacter(const char *&CurPtr, - char StopQuote, bool EmitDiagnostics, - bool MultilineString = false); - void lexStringLiteral(); + unsigned lexCharacter(const char *&CurPtr, char StopQuote, + bool EmitDiagnostics, bool IsMultilineString = false, + unsigned CustomDelimiterLen = 0); + void lexStringLiteral(unsigned CustomDelimiterLen = 0); void lexEscapedIdentifier(); void tryLexEditorPlaceholder(); diff --git a/include/swift/Parse/Token.h b/include/swift/Parse/Token.h index 9d1a99a179751..6cd0fc95828c4 100644 --- a/include/swift/Parse/Token.h +++ b/include/swift/Parse/Token.h @@ -45,7 +45,10 @@ class Token { /// Modifiers for string literals unsigned MultilineString : 1; - // Padding bits == 32 - sizeof(Kind) * 8 - 3; + /// Length of custom delimiter of "raw" string literals + unsigned CustomDelimiterLen : 8; + + // Padding bits == 32 - 11; /// \brief The length of the comment that precedes the token. unsigned CommentLength; @@ -62,8 +65,8 @@ class Token { public: Token(tok Kind, StringRef Text, unsigned CommentLength = 0) : Kind(Kind), AtStartOfLine(false), EscapedIdentifier(false), - MultilineString(false), CommentLength(CommentLength), - Text(Text) {} + MultilineString(false), CustomDelimiterLen(0), + CommentLength(CommentLength), Text(Text) {} Token() : Token(tok::NUM_TOKENS, {}, 0) {} @@ -266,17 +269,24 @@ class Token { /// \brief Set the token to the specified kind and source range. 
void setToken(tok K, StringRef T, unsigned CommentLength = 0, - bool MultilineString = false) { + bool IsMultilineString = false, unsigned CustomDelimiterLen = 0) { Kind = K; Text = T; this->CommentLength = CommentLength; EscapedIdentifier = false; - this->MultilineString = MultilineString; + this->MultilineString = IsMultilineString; + this->CustomDelimiterLen = CustomDelimiterLen; + assert(this->CustomDelimiterLen == CustomDelimiterLen && + "custom string delimiter length > 255"); } - bool IsMultilineString() const { + bool isMultilineString() const { return MultilineString; } + + unsigned getCustomDelimiterLen() const { + return CustomDelimiterLen; + } }; } // end namespace swift diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index 659c6bba24b02..4cc1bceb90e04 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -267,7 +267,8 @@ Token Lexer::getTokenAt(SourceLoc Loc) { return Result; } -void Lexer::formToken(tok Kind, const char *TokStart, bool MultilineString) { +void Lexer::formToken(tok Kind, const char *TokStart, + bool IsMultilineString, unsigned CustomDelimiterLen) { assert(CurPtr >= BufferStart && CurPtr <= BufferEnd && "Current pointer out of range!"); @@ -299,7 +300,8 @@ void Lexer::formToken(tok Kind, const char *TokStart, bool MultilineString) { lexTrivia(TrailingTrivia, /* IsForTrailingTrivia */ true); } - NextToken.setToken(Kind, TokenText, CommentLength, MultilineString); + NextToken.setToken(Kind, TokenText, CommentLength, + IsMultilineString, CustomDelimiterLen); } void Lexer::formEscapedIdentifierToken(const char *TokStart) { @@ -1210,6 +1212,69 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) { } } +/// diagnoseZeroWidthMatchAndAdvance - Error invisible characters in delimiters. +/// An invisible character in the middle of a delimiter can be used to extend +/// the literal beyond what it would appear creating potential security bugs. 
+static bool diagnoseZeroWidthMatchAndAdvance(char Target, const char *&CurPtr, + DiagnosticEngine *Diags) { + // TODO: Detect, diagnose and skip over zero-width characters if required. + // See https://github.com/apple/swift/pull/17668 for possible implementation. + return *CurPtr == Target && CurPtr++; +} + +/// advanceIfMultilineDelimiter - Centralized check for multiline delimiter. +static bool advanceIfMultilineDelimiter(const char *&CurPtr, + DiagnosticEngine *Diags) { + const char *TmpPtr = CurPtr; + if (*(TmpPtr - 1) == '"' && + diagnoseZeroWidthMatchAndAdvance('"', TmpPtr, Diags) && + diagnoseZeroWidthMatchAndAdvance('"', TmpPtr, Diags)) { + CurPtr = TmpPtr; + return true; + } + return false; +} + +/// advanceIfCustomDelimiter - Extracts/detects any custom delimiter on +/// opening a string literal, advances CurPtr if a delimiter is found and +/// returns a non-zero delimiter length. CurPtr[-1] generally '#' when called. +static unsigned advanceIfCustomDelimiter(const char *&CurPtr, + DiagnosticEngine *Diags) { + const char *TmpPtr = CurPtr; + unsigned CustomDelimiterLen = 1; + while (diagnoseZeroWidthMatchAndAdvance('#', TmpPtr, Diags)) + CustomDelimiterLen++; + if (diagnoseZeroWidthMatchAndAdvance('"', TmpPtr, Diags)) { + CurPtr = TmpPtr; + return CustomDelimiterLen; + } + return 0; +} + +/// delimiterMatches - Does custom delimiter ('#' characters surrounding quotes) +/// match the number of '#' characters after '\' inside the string? This allows +/// interpolation inside a "raw" string. Normal/cooked string processing is +/// the degenerate case of there being no '#' characters surrounding the quotes. +/// If delimiter matches, advances byte pointer passed in and returns true. +/// Also used to detect the final delimiter of a string when IsClosing == true. 
+static bool delimiterMatches(unsigned CustomDelimiterLen, const char *&BytesPtr, + DiagnosticEngine *Diags, bool IsClosing = false) { + if (!CustomDelimiterLen) + return true; + const char *TmpPtr = BytesPtr; + while (CustomDelimiterLen--) + if (!diagnoseZeroWidthMatchAndAdvance('#', TmpPtr, Diags)) + return false; + BytesPtr = TmpPtr; + if (*BytesPtr == '#' && Diags) + Diags->diagnose(Lexer::getSourceLoc(BytesPtr), IsClosing ? + diag::lex_invalid_closing_delimiter : + diag::lex_invalid_escape_delimiter) + .fixItRemoveChars(Lexer::getSourceLoc(BytesPtr), + Lexer::getSourceLoc(BytesPtr + 1)); + return true; +} + /// lexCharacter - Read a character and return its UTF32 code. If this is the /// end of enclosing string/character sequence (i.e. the character is equal to /// 'StopQuote'), this returns ~0U and leaves 'CurPtr' pointing to the terminal @@ -1219,7 +1284,8 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) { /// character_escape ::= [\][\] | [\]t | [\]n | [\]r | [\]" | [\]' | [\]0 /// character_escape ::= unicode_character_escape unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, - bool EmitDiagnostics, bool MultilineString) { + bool EmitDiagnostics, bool IsMultilineString, + unsigned CustomDelimiterLen) { const char *CharStart = CurPtr; switch (*CurPtr++) { @@ -1227,7 +1293,7 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, // If this is a "high" UTF-8 character, validate it. if ((signed char)(CurPtr[-1]) >= 0) { if (isPrintable(CurPtr[-1]) == 0) - if (!(MultilineString && (CurPtr[-1] == '\t'))) + if (!(IsMultilineString && (CurPtr[-1] == '\t'))) if (EmitDiagnostics) diagnose(CharStart, diag::lex_unprintable_ascii_character); return CurPtr[-1]; @@ -1262,12 +1328,15 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, return ~1U; case '\n': // String literals cannot have \n or \r in them. case '\r': - if (MultilineString) // ... 
unless they are multiline + if (IsMultilineString) // ... unless they are multiline return CurPtr[-1]; if (EmitDiagnostics) diagnose(CurPtr-1, diag::lex_unterminated_string); return ~1U; case '\\': // Escapes. + if (!delimiterMatches(CustomDelimiterLen, CurPtr, + EmitDiagnostics ? Diags : nullptr)) + return '\\'; break; } @@ -1275,7 +1344,7 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, // Escape processing. We already ate the "\". switch (*CurPtr) { case ' ': case '\t': case '\n': case '\r': - if (MultilineString && maybeConsumeNewlineEscape(CurPtr, 0)) + if (IsMultilineString && maybeConsumeNewlineEscape(CurPtr, 0)) return '\n'; LLVM_FALLTHROUGH; default: // Invalid escape. @@ -1333,10 +1402,11 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, const char *EndPtr, DiagnosticEngine *Diags, - bool MultilineString) { - llvm::SmallVector OpenDelimiters; - llvm::SmallVector AllowNewline; - AllowNewline.push_back(MultilineString); + bool IsMultilineString) { + SmallVector OpenDelimiters; + SmallVector AllowNewline; + SmallVector CustomDelimiter; + AllowNewline.push_back(IsMultilineString); auto inStringLiteral = [&]() { return !OpenDelimiters.empty() && @@ -1351,6 +1421,7 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, // On success scanning the expression body, the real lexer will be used to // relex the body when parsing the expressions. We let it diagnose any // issues with malformed tokens or other problems. + unsigned CustomDelimiterLen = 0; switch (*CurPtr++) { // String literals in general cannot be split across multiple lines; // interpolated ones are no exception - unless multiline literals. @@ -1361,43 +1432,52 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, // Will be diagnosed as an unterminated string literal. 
return CurPtr-1; + case '#': + if (inStringLiteral() || + !(CustomDelimiterLen = advanceIfCustomDelimiter(CurPtr, Diags))) + continue; + LLVM_FALLTHROUGH; + case '"': case '\'': { if (!AllowNewline.back() && inStringLiteral()) { - if (OpenDelimiters.back() == CurPtr[-1]) { + if (OpenDelimiters.back() == CurPtr[-1] && + delimiterMatches(CustomDelimiter.back(), CurPtr, Diags, true)) { // Closing single line string literal. OpenDelimiters.pop_back(); AllowNewline.pop_back(); + CustomDelimiter.pop_back(); } // Otherwise, it's just a quote in string literal. e.g. "foo's". continue; } - bool isMultilineQuote = ( - *CurPtr == '"' && *(CurPtr + 1) == '"' && *(CurPtr - 1) == '"'); - if (isMultilineQuote) - CurPtr += 2; + bool isMultilineQuote = advanceIfMultilineDelimiter(CurPtr, Diags); if (!inStringLiteral()) { // Open string literal OpenDelimiters.push_back(CurPtr[-1]); AllowNewline.push_back(isMultilineQuote); + CustomDelimiter.push_back(CustomDelimiterLen); continue; } // We are in multiline string literal. assert(AllowNewline.back() && "other cases must be handled above"); - if (isMultilineQuote) { + if (isMultilineQuote && + delimiterMatches(CustomDelimiter.back(), CurPtr, Diags, true)) { // Close multiline string literal. OpenDelimiters.pop_back(); AllowNewline.pop_back(); + CustomDelimiter.pop_back(); } // Otherwise, it's just a normal character in multiline string. 
continue; } case '\\': - if (inStringLiteral()) { + if (inStringLiteral() && + delimiterMatches(CustomDelimiter.back(), CurPtr, Diags)) { char escapedChar = *CurPtr++; switch (escapedChar) { case '(': @@ -1457,7 +1537,10 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, static StringRef getStringLiteralContent(const Token &Str) { StringRef Bytes = Str.getText(); - if (Str.IsMultilineString()) + if (unsigned CustomDelimiterLen = Str.getCustomDelimiterLen()) + Bytes = Bytes.drop_front(CustomDelimiterLen).drop_back(CustomDelimiterLen); + + if (Str.isMultilineString()) Bytes = Bytes.drop_front(3).drop_back(3); else Bytes = Bytes.drop_front().drop_back(); @@ -1476,9 +1559,9 @@ static size_t commonPrefixLength(StringRef shorter, StringRef longer) { /// getMultilineTrailingIndent: /// Determine trailing indent to be used for multiline literal indent stripping. -static std::tuple -getMultilineTrailingIndent(const Token &Str, DiagnosticEngine *Diags) { - StringRef Bytes = getStringLiteralContent(Str); +static StringRef +getMultilineTrailingIndent(StringRef Bytes, DiagnosticEngine *Diags = nullptr, + unsigned CustomDelimiterLen = 0) { const char *begin = Bytes.begin(), *end = Bytes.end(), *start = end; bool sawNonWhitespace = false; @@ -1491,11 +1574,9 @@ getMultilineTrailingIndent(const Token &Str, DiagnosticEngine *Diags) { case '\n': case '\r': { ++start; - auto startLoc = Lexer::getSourceLoc(start); - auto string = StringRef(start, end - start); // Disallow escaped newline in the last line. 
- if (Diags) { + if (Diags && !CustomDelimiterLen) { auto *Ptr = start - 1; if (*Ptr == '\n') --Ptr; if (*Ptr == '\r') --Ptr; @@ -1511,7 +1592,7 @@ getMultilineTrailingIndent(const Token &Str, DiagnosticEngine *Diags) { } } - return std::make_tuple(string, startLoc); + return StringRef(start, end - start); } default: sawNonWhitespace = true; @@ -1525,7 +1606,7 @@ getMultilineTrailingIndent(const Token &Str, DiagnosticEngine *Diags) { .fixItInsert(loc, "\n"); } - return std::make_tuple("", Lexer::getSourceLoc(end - 1)); + return ""; } /// diagnoseInvalidMultilineIndents: @@ -1589,12 +1670,13 @@ static void diagnoseInvalidMultilineIndents( /// Diagnose contents of string literal that have inconsistent indentation. static void validateMultilineIndents(const Token &Str, DiagnosticEngine *Diags) { - StringRef Indent; - SourceLoc IndentStartLoc; - std::tie(Indent, IndentStartLoc) = getMultilineTrailingIndent(Str, Diags); + StringRef Bytes = getStringLiteralContent(Str); + StringRef Indent = + getMultilineTrailingIndent(Bytes, Diags, Str.getCustomDelimiterLen()); if (Indent.empty()) return; - + SourceLoc IndentStartLoc = Lexer::getSourceLoc(Indent.data()); + // The offset into the previous line where it experienced its first indentation // error, or Indent.size() if every character matched. size_t lastMistakeOffset = std::numeric_limits::max(); @@ -1604,7 +1686,6 @@ static void validateMultilineIndents(const Token &Str, // Prefix of indentation that's present on all lines in linesWithLastMatchLength. 
StringRef commonIndentation = ""; - StringRef Bytes = getStringLiteralContent(Str); for (size_t pos = Bytes.find('\n'); pos != StringRef::npos; pos = Bytes.find('\n', pos + 1)) { size_t nextpos = pos + 1; auto restOfBytes = Bytes.substr(nextpos); @@ -1651,30 +1732,31 @@ static void validateMultilineIndents(const Token &Str, /// lexStringLiteral: /// string_literal ::= ["]([^"\\\n\r]|character_escape)*["] /// string_literal ::= ["]["]["].*["]["]["] - approximately -void Lexer::lexStringLiteral() { +/// string_literal ::= (#+)("")?".*"(\2\1) - "raw" strings +void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) { const char *TokStart = CurPtr-1; assert((*TokStart == '"' || *TokStart == '\'') && "Unexpected start"); // NOTE: We only allow single-quote string literals so we can emit useful // diagnostics about changing them to double quotes. - bool wasErroneous = false, MultilineString = false; + bool wasErroneous = false, IsMultilineString = false; // Is this the start of a multiline string literal? - if (*TokStart == '"' && *CurPtr == '"' && *(CurPtr + 1) == '"') { - MultilineString = true; - CurPtr += 2; + if ((IsMultilineString = advanceIfMultilineDelimiter(CurPtr, Diags))) { if (*CurPtr != '\n' && *CurPtr != '\r') diagnose(CurPtr, diag::lex_illegal_multiline_string_start) .fixItInsert(Lexer::getSourceLoc(CurPtr), "\n"); } while (true) { - if (*CurPtr == '\\' && *(CurPtr + 1) == '(') { + const char *TmpPtr = CurPtr + 1; + if (*CurPtr == '\\' && delimiterMatches(CustomDelimiterLen, TmpPtr, nullptr) + && *TmpPtr == '(') { // Consume tokens until we hit the corresponding ')'. - CurPtr += 2; + CurPtr = TmpPtr + 1; const char *EndPtr = skipToEndOfInterpolatedExpression(CurPtr, BufferEnd, - Diags, MultilineString); + Diags, IsMultilineString); if (*EndPtr == ')') { // Successfully scanned the body of the expression literal. @@ -1687,21 +1769,21 @@ void Lexer::lexStringLiteral() { } // String literals cannot have \n or \r in them (unless multiline). 
- if (((*CurPtr == '\r' || *CurPtr == '\n') && !MultilineString) + if (((*CurPtr == '\r' || *CurPtr == '\n') && !IsMultilineString) || CurPtr == BufferEnd) { + TokStart -= CustomDelimiterLen; diagnose(TokStart, diag::lex_unterminated_string); return formToken(tok::unknown, TokStart); } - unsigned CharValue = lexCharacter(CurPtr, *TokStart, true, MultilineString); + unsigned CharValue = lexCharacter(CurPtr, *TokStart, true, + IsMultilineString, CustomDelimiterLen); wasErroneous |= CharValue == ~1U; // If this is the end of string, we are done. If it is a normal character // or an already-diagnosed error, just munch it. if (CharValue == ~0U) { ++CurPtr; - if (wasErroneous) - return formToken(tok::unknown, TokStart); if (*TokStart == '\'') { // Complain about single-quote string and suggest replacement with @@ -1737,20 +1819,19 @@ void Lexer::lexStringLiteral() { replacement); } - // Is this the end of a multiline string literal? - if (MultilineString) { - if (*CurPtr == '"' && *(CurPtr + 1) == '"' && *(CurPtr + 2) != '"') { - CurPtr += 2; - formToken(tok::string_literal, TokStart, MultilineString); - if (Diags) - validateMultilineIndents(NextToken, Diags); - return; - } - else - continue; + // Is this the end of multiline/custom-delimited string literal? 
+ if ((!IsMultilineString || advanceIfMultilineDelimiter(CurPtr, Diags)) && + delimiterMatches(CustomDelimiterLen, CurPtr, Diags, true)) { + TokStart -= CustomDelimiterLen; + if (wasErroneous) + return formToken(tok::unknown, TokStart); + + formToken(tok::string_literal, TokStart, + IsMultilineString, CustomDelimiterLen); + if (IsMultilineString && Diags) + validateMultilineIndents(NextToken, Diags); + return; } - - return formToken(tok::string_literal, TokStart, MultilineString); } } } @@ -2011,17 +2092,25 @@ void Lexer::tryLexEditorPlaceholder() { lexOperatorIdentifier(); } -StringRef Lexer::getEncodedStringSegment(StringRef Bytes, - SmallVectorImpl &TempString, - bool IsFirstSegment, - bool IsLastSegment, - unsigned IndentToStrip) { +StringRef Lexer::getEncodedStringSegmentImpl(StringRef Bytes, + SmallVectorImpl &TempString, + bool IsFirstSegment, + bool IsLastSegment, + unsigned IndentToStrip, + unsigned CustomDelimiterLen) { TempString.clear(); - // Note that it is always safe to read one over the end of "Bytes" because - // we know that there is a terminating " character. Use BytesPtr to avoid a - // range check subscripting on the StringRef. + // Note that it is always safe to read one over the end of "Bytes" because we + // know that there is a terminating " character (or null byte for an + // unterminated literal or a segment that doesn't come from source). Use + // BytesPtr to avoid a range check subscripting on the StringRef. const char *BytesPtr = Bytes.begin(); + + // Special case when being called from EncodedDiagnosticMessage(...) + // This should allow multiline strings to work as attribute messages. 
+ if (IndentToStrip == ~0U) + IndentToStrip = getMultilineTrailingIndent(Bytes).size(); + bool IsEscapedNewline = false; while (BytesPtr < Bytes.end()) { char CurChar = *BytesPtr++; @@ -2042,7 +2131,8 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes, continue; } - if (CurChar != '\\') { + if (CurChar != '\\' || + !delimiterMatches(CustomDelimiterLen, BytesPtr, nullptr)) { TempString.push_back(CurChar); continue; } @@ -2112,24 +2202,22 @@ void Lexer::getStringLiteralSegments( // Are substitutions required either for indent stripping or line ending // normalization? - bool MultilineString = Str.IsMultilineString(), IsFirstSegment = true; - unsigned IndentToStrip = 0; + bool MultilineString = Str.isMultilineString(), IsFirstSegment = true; + unsigned IndentToStrip = 0, CustomDelimiterLen = Str.getCustomDelimiterLen(); if (MultilineString) - IndentToStrip = - std::get<0>(getMultilineTrailingIndent(Str, /*Diags=*/nullptr)).size(); + IndentToStrip = getMultilineTrailingIndent(Bytes).size(); // Note that it is always safe to read one over the end of "Bytes" because // we know that there is a terminating " character. Use BytesPtr to avoid a // range check subscripting on the StringRef. const char *SegmentStartPtr = Bytes.begin(); const char *BytesPtr = SegmentStartPtr; - // FIXME: Use SSE to scan for '\'. - while (BytesPtr != Bytes.end()) { - char CurChar = *BytesPtr++; - if (CurChar != '\\') - continue; + size_t pos; + while ((pos = Bytes.find('\\', BytesPtr-Bytes.begin())) != StringRef::npos) { + BytesPtr = Bytes.begin() + pos + 1; - if (*BytesPtr++ != '(') + if (!delimiterMatches(CustomDelimiterLen, BytesPtr, Diags) || + *BytesPtr++ != '(') continue; // String interpolation. @@ -2137,8 +2225,9 @@ void Lexer::getStringLiteralSegments( // Push the current segment. 
Segments.push_back( StringSegment::getLiteral(getSourceLoc(SegmentStartPtr), - BytesPtr-SegmentStartPtr-2, - IsFirstSegment, false, IndentToStrip)); + BytesPtr-SegmentStartPtr-2-CustomDelimiterLen, + IsFirstSegment, false, IndentToStrip, + CustomDelimiterLen)); IsFirstSegment = false; // Find the closing ')'. @@ -2161,7 +2250,8 @@ void Lexer::getStringLiteralSegments( Segments.push_back( StringSegment::getLiteral(getSourceLoc(SegmentStartPtr), Bytes.end()-SegmentStartPtr, - IsFirstSegment, true, IndentToStrip)); + IsFirstSegment, true, IndentToStrip, + CustomDelimiterLen)); } @@ -2259,6 +2349,8 @@ void Lexer::lexImpl() { case '\\': return formToken(tok::backslash, TokStart); case '#': + if (unsigned CustomDelimiterLen = advanceIfCustomDelimiter(CurPtr, Diags)) + return lexStringLiteral(CustomDelimiterLen); return lexHash(); // Operator characters. diff --git a/lib/Parse/ParseDecl.cpp b/lib/Parse/ParseDecl.cpp index e3790a117191a..6f53cde7fcc1b 100644 --- a/lib/Parse/ParseDecl.cpp +++ b/lib/Parse/ParseDecl.cpp @@ -288,6 +288,12 @@ bool Parser::parseTopLevel() { static Optional getStringLiteralIfNotInterpolated(Parser &P, SourceLoc Loc, const Token &Tok, StringRef DiagText) { + // FIXME: Support extended escaping string literal. + if (Tok.getCustomDelimiterLen()) { + P.diagnose(Loc, diag::attr_extended_escaping_string, DiagText); + return None; + } + SmallVector Segments; P.L->getStringLiteralSegments(Tok, Segments); if (Segments.size() != 1 || diff --git a/lib/Parse/ParseExpr.cpp b/lib/Parse/ParseExpr.cpp index 2d3e5f3c717eb..998cda87293bc 100644 --- a/lib/Parse/ParseExpr.cpp +++ b/lib/Parse/ParseExpr.cpp @@ -1935,7 +1935,7 @@ ParserResult Parser::parseExprStringLiteral() { LocalContext.setCreateSyntax(SyntaxKind::StringInterpolationExpr); StringRef Quote; tok QuoteKind; - std::tie(Quote, QuoteKind) = Tok.IsMultilineString() ? + std::tie(Quote, QuoteKind) = Tok.isMultilineString() ? 
std::make_tuple("\"\"\"", tok::multiline_string_quote) : std::make_tuple("\"", tok::string_quote); diff --git a/lib/Parse/Parser.cpp b/lib/Parse/Parser.cpp index 2dade47014648..482f505442e48 100644 --- a/lib/Parse/Parser.cpp +++ b/lib/Parse/Parser.cpp @@ -219,8 +219,9 @@ static void getStringPartTokens(const Token &Tok, const LangOptions &LangOpts, const SourceManager &SM, int BufID, std::vector &Toks) { assert(Tok.is(tok::string_literal)); - bool IsMultiline = Tok.IsMultilineString(); - unsigned QuoteLen = IsMultiline ? 3 : 1; + bool IsMultiline = Tok.isMultilineString(); + unsigned CustomDelimiterLen = Tok.getCustomDelimiterLen(); + unsigned QuoteLen = (IsMultiline ? 3 : 1) + CustomDelimiterLen; SmallVector Segments; Lexer::getStringLiteralSegments(Tok, Segments, /*Diags=*/nullptr); for (unsigned i = 0, e = Segments.size(); i != e; ++i) { @@ -242,7 +243,8 @@ static void getStringPartTokens(const Token &Tok, const LangOptions &LangOpts, StringRef Text = SM.extractText({ Loc, Len }); Token NewTok; - NewTok.setToken(tok::string_literal, Text, IsMultiline); + NewTok.setToken(tok::string_literal, Text, /*CommentLength=*/0, + IsMultiline, CustomDelimiterLen); Toks.push_back(NewTok); } else { @@ -372,7 +374,7 @@ class TokenRecorder: public ConsumeTokenReceiver { } void relexComment(CharSourceRange CommentRange, - llvm::SmallVectorImpl &Scracth) { + llvm::SmallVectorImpl &Scratch) { Lexer L(Ctx.LangOpts, Ctx.SourceMgr, BufferID, nullptr, /*InSILMode=*/false, CommentRetentionMode::ReturnAsTokens, TriviaRetentionMode::WithoutTrivia, @@ -384,7 +386,7 @@ class TokenRecorder: public ConsumeTokenReceiver { if (Result.is(tok::eof)) break; assert(Result.is(tok::comment)); - Scracth.push_back(Result); + Scratch.push_back(Result); } } diff --git a/lib/Sema/TypeChecker.h b/lib/Sema/TypeChecker.h index bb4a7d706cae3..b457856b3b2ce 100644 --- a/lib/Sema/TypeChecker.h +++ b/lib/Sema/TypeChecker.h @@ -2573,7 +2573,7 @@ class EncodedDiagnosticMessage { public: /// \param S A string with an encoded message 
EncodedDiagnosticMessage(StringRef S) - : Message(Lexer::getEncodedStringSegment(S, Buf)) {} + : Message(Lexer::getEncodedStringSegment(S, Buf, true, true, ~0U)) {} /// The unescaped message to display to the user. const StringRef Message; diff --git a/test/Parse/diagnose_availability.swift b/test/Parse/diagnose_availability.swift index 506bce9f46e22..2cd5774c34323 100644 --- a/test/Parse/diagnose_availability.swift +++ b/test/Parse/diagnose_availability.swift @@ -26,3 +26,31 @@ func availableOnMultiplePlatforms() {} // expected-error@-1 {{'deprecated' can't be combined with shorthand specification 'OSX 10.0'}} // expected-error@-2 {{expected declaration}} func twoShorthandsFollowedByDeprecated() {} + +@available(*, unavailable, message: "\("message")") +// expected-error@-1{{'message' cannot be an interpolated string literal}} +func interpolatedMessage() {} + +@available(*, unavailable, message: """ + foobar message. + """) +func multilineMessage() {} +multilineMessage() +// expected-error@-1{{'multilineMessage()' is unavailable: foobar message.}} +// expected-note@-3{{'multilineMessage()' has been explicitly marked unavailable here}} + +@available(*, unavailable, message: " ") +func emptyMessage() {} +emptyMessage() +// expected-error@-1{{'emptyMessage()' is unavailable: }} +// expected-note@-3{{'emptyMessage()' has been explicitly marked unavailable here}} + +// expected-error@+1{{'message' cannot be an extended escaping string literal}} +@available(*, unavailable, message: #""" + foobar message. 
+ """#) +func extendedEscapedMultilineMessage() {} + +// expected-error@+1{{'renamed' cannot be an extended escaping string literal}} +@available(*, unavailable, renamed: #"avialable()"#) +func extenedEscpaedRenamed() {} diff --git a/test/Parse/raw_string.swift b/test/Parse/raw_string.swift new file mode 100644 index 0000000000000..953b92324b041 --- /dev/null +++ b/test/Parse/raw_string.swift @@ -0,0 +1,134 @@ +// RUN: %target-swift-frontend -dump-ast %s 2>&1 | %FileCheck --strict-whitespace %s + +import Swift + +_ = #""" +################################################################### +## This source file is part of the Swift.org open source project ## +################################################################### +"""# +// CHECK: "###################################################################\n## This source file is part of the Swift.org open source project ##\n###################################################################" + +_ = #""" + # H1 # + ## H2 ## + ### H3 ### + """# +// CHECK: "# H1 #\n## H2 ##\n### H3 ###" + +// ===---------- Multiline RawString --------=== + +_ = ##""" + One + ""Alpha"" + """## +// CHECK: "One\n\"\"Alpha\"\"" + +_ = ##""" + Two + Beta + """## +// CHECK: " Two\nBeta" + +_ = #""" + Three\r + Gamma\ + """# +// CHECK: " Three\\r\n Gamma\\" + +_ = ###""" + Four \(foo) + Delta +"""### +// CHECK: " Four \\(foo)\n Delta" + +_ = ##""" + print(""" + Five\##n\##n\##nEpsilon + """) + """## +// CHECK: "print(\"\"\"\n Five\n\n\nEpsilon\n \"\"\")" + +// ===---------- Single line --------=== + +_ = #""Zeta""# +// CHECK: "\"Zeta\"" + +_ = #""Eta"\#n\#n\#n\#""# +// CHECK: "\"Eta\"\n\n\n\"" + +_ = #""Iota"\n\n\n\""# +// CHECK: "\"Iota\"\\n\\n\\n\\\"" + +_ = #"a raw string with \" in it"# +// CHECK: "a raw string with \\\" in it" + +_ = ##""" + a raw string with """ in it + """## +// CHECK: "a raw string with \"\"\" in it" + +let foo = "Interpolation" +_ = #"\b\b \#(foo)\#(foo) Kappa"# +// CHECK: "\\b\\b " +// CHECK: " Kappa" + +_ = 
""" + interpolating \(##""" + delimited \##("string")\#n\##n + """##) + """ + +// CHECK: "interpolating " +// CHECK: "delimited " +// CHECK: "string" +// CHECK: "\\#n\n" + +#"unused literal"# +// CHECK: "unused literal" + +// ===---------- From proposal --------=== + +_ = #"This is a string"# +// CHECK: "This is a string" + +_ = #####"This is a string"##### +// CHECK: "This is a string" + +_ = #"enum\s+.+\{.*case\s+[:upper:]"# +// CHECK: "enum\\s+.+\\{.*case\\s+[:upper:]" + +_ = #"Alice: "How long is forever?" White Rabbit: "Sometimes, just one second.""# +// CHECK: "Alice: \"How long is forever?\" White Rabbit: \"Sometimes, just one second.\"" + +_ = #"\#\#1"# +// CHECK: "\\#1" + +_ = ##"\#1"## +// CHECK: "\\#1" + +_ = #"c:\windows\system32"# +// CHECK: "c:\\windows\\system32" + +_ = #"\d{3) \d{3} \d{4}"# +// CHECK: "\\d{3) \\d{3} \\d{4}" + +_ = #""" + a string with + """ + in it + """# +// CHECK: "a string with\n\"\"\"\nin it" + +_ = #"a raw string containing \r\n"# +// CHECK: "a raw string containing \\r\\n" + +_ = #""" + [ + { + "id": "12345", + "title": "A title that \"contains\" \\\"" + } + ] + """# +// CHECK: "[\n {\n \"id\": \"12345\",\n \"title\": \"A title that \\\"contains\\\" \\\\\\\"\"\n }\n]" diff --git a/test/Parse/raw_string_errors.swift b/test/Parse/raw_string_errors.swift new file mode 100644 index 0000000000000..ca19b41fe4dd2 --- /dev/null +++ b/test/Parse/raw_string_errors.swift @@ -0,0 +1,14 @@ +// RUN: %target-typecheck-verify-swift + +#"\##("invalid")"# +// expected-error@-1{{too many '#' characters in delimited escape}} +// expected-error@-2{{invalid escape sequence in literal}} + +####"invalid"### +// expected-error@-1{{unterminated string literal}} + +###"invalid"#### +// expected-error@-1{{too many '#' characters in closing delimiter}} +// expected-error@-2{{consecutive statements on a line must be separated by ';'}} +// expected-error@-3{{expected expression}} +// expected-warning@-4{{string literal is unused}} diff --git 
a/unittests/Parse/LexerTests.cpp b/unittests/Parse/LexerTests.cpp
index 3350182b9f294..0a996a8358aa8 100644
--- a/unittests/Parse/LexerTests.cpp
+++ b/unittests/Parse/LexerTests.cpp
@@ -1,12 +1,21 @@
 #include "swift/AST/DiagnosticConsumer.h"
 #include "swift/AST/DiagnosticEngine.h"
+#include "swift/Basic/Defer.h"
 #include "swift/Basic/LangOptions.h"
 #include "swift/Basic/SourceManager.h"
 #include "swift/Parse/Lexer.h"
 #include "swift/Subsystems.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Process.h"
 #include "gtest/gtest.h"
 
+#if __has_include(<sys/mman.h>)
+# include <sys/mman.h>
+# define HAS_MMAP 1
+#else
+# define HAS_MMAP 0
+#endif
+
 using namespace swift;
 using namespace llvm;
 
@@ -778,3 +787,43 @@ TEST_F(LexerTest, DiagnoseEmbeddedNulOffset) {
   ASSERT_FALSE(containsPrefix(
       DiagConsumer.messages, "1, 4: nul character embedded in middle of file"));
 }
+
+#if HAS_MMAP
+
+// This test requires mmap because llvm::sys::Memory doesn't support protecting
+// pages to have no permissions.
+TEST_F(LexerTest, EncodedStringSegmentPastTheEnd) {
+  size_t PageSize = llvm::sys::Process::getPageSize();
+
+  // Reserve two inaccessible pages, then open up only the first one; the
+  // second stays PROT_NONE as a guard page.
+  void *FirstPage = mmap(/*addr*/nullptr, PageSize * 2, PROT_NONE,
+                         MAP_PRIVATE | MAP_ANON, /*fd*/-1, /*offset*/0);
+  // Verify the mapping before registering cleanup so that munmap is never
+  // called on MAP_FAILED.
+  ASSERT_NE(FirstPage, MAP_FAILED);
+  SWIFT_DEFER { (void)munmap(FirstPage, PageSize * 2); };
+  int ProtectResult = mprotect(FirstPage, PageSize, PROT_READ | PROT_WRITE);
+  ASSERT_EQ(ProtectResult, 0);
+
+  auto check = [FirstPage, PageSize](StringRef Input, StringRef Expected) {
+    // Copy Input so its last byte is the last readable byte of the first
+    // page; the byte after it belongs to the guard page.
+    char *StartPtr = static_cast<char *>(FirstPage) + PageSize - Input.size();
+    memcpy(StartPtr, Input.data(), Input.size());
+
+    SmallString<64> Buffer;
+    StringRef Escaped = Lexer::getEncodedStringSegment({StartPtr, Input.size()},
+                                                       Buffer);
+    EXPECT_EQ(Escaped, Expected);
+  };
+
+  check("needs escaping\\r",
+        "needs escaping\r");
+  check("does not need escaping",
+        "does not need escaping");
+  check("invalid escape at the end \\",
+        "invalid escape at the end ");
+}
+
+#endif // HAS_MMAP