diff --git a/include/clang/Basic/ConvertUTF.h b/include/clang/Basic/ConvertUTF.h index fb05afdae73e7ebc33dca682fd2dc791d9d690ad..38956ee340a3cad8108db5f4f35510c6d5af6911 100644 --- a/include/clang/Basic/ConvertUTF.h +++ b/include/clang/Basic/ConvertUTF.h @@ -161,6 +161,21 @@ Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd); unsigned getNumBytesForUTF8(UTF8 firstByte); +/* Convert the single UTF-8 sequence at *source into a one-element UTF-32 + * target. Returns sourceExhausted when fewer bytes than the lead byte calls + * for (possibly none at all) remain before sourceEnd. */ +static inline ConversionResult convertUTF8Sequence(const UTF8 **source, + const UTF8 *sourceEnd, + UTF32 *target, + ConversionFlags flags) { + if (*source == sourceEnd) + return sourceExhausted; + unsigned size = getNumBytesForUTF8(**source); + if ((int)size > sourceEnd - *source) + return sourceExhausted; + return ConvertUTF8toUTF32(source, *source + size, &target, target + 1, flags); +} + #ifdef __cplusplus } diff --git a/include/clang/Basic/DiagnosticLexKinds.td b/include/clang/Basic/DiagnosticLexKinds.td index c8b44230c9e4151e6a2f278934afb83e5620b1e6..00b385ef124f61f5454aa999fc450bc146ec5ce6 100644 --- a/include/clang/Basic/DiagnosticLexKinds.td +++ b/include/clang/Basic/DiagnosticLexKinds.td @@ -93,15 +93,29 @@ def ext_multichar_character_literal : ExtWarn< "multi-character character constant">, InGroup<MultiChar>; def ext_four_char_character_literal : Extension< "multi-character character constant">, InGroup<FourByteMultiChar>; - -// Literal -def ext_nonstandard_escape : Extension< - "use of non-standard escape character '\\%0'">; -def ext_unknown_escape : ExtWarn<"unknown escape sequence '\\%0'">; -def err_hex_escape_no_digits : Error<"\\%0 used with no following hex digits">; + +// Unicode and UCNs +def err_invalid_utf8 : Error< + "source file is not valid UTF-8">; +def err_non_ascii : Error< + "non-ASCII characters are not allowed outside of literals and identifiers">; +def ext_unicode_whitespace : ExtWarn< + "treating Unicode character as whitespace">, + InGroup<DiagGroup<"unicode-whitespace">>; + +def err_hex_escape_no_digits : Error< + "\\%0 used with no following hex digits">; +def warn_ucn_escape_no_digits : Warning< 
"\\%0 used with no following hex digits; " + "treating as '\\' followed by identifier">, InGroup<Unicode>; +def err_ucn_escape_incomplete : Error< + "incomplete universal character name">; +def warn_ucn_escape_incomplete : Warning< + "incomplete universal character name; " + "treating as '\\' followed by identifier">, InGroup<Unicode>; def err_ucn_escape_invalid : Error<"invalid universal character">; -def err_ucn_escape_incomplete : Error<"incomplete universal character name">; + def err_ucn_escape_basic_scs : Error< "character '%0' cannot be specified by a universal character name">; def err_ucn_control_character : Error< @@ -112,6 +126,12 @@ def warn_cxx98_compat_literal_ucn_escape_basic_scs : Warning< def warn_cxx98_compat_literal_ucn_control_character : Warning< "universal character name referring to a control character " "is incompatible with C++98">, InGroup<CXX98Compat>, DefaultIgnore; + + +// Literal +def ext_nonstandard_escape : Extension< + "use of non-standard escape character '\\%0'">; +def ext_unknown_escape : ExtWarn<"unknown escape sequence '\\%0'">; def err_invalid_decimal_digit : Error<"invalid digit '%0' in decimal constant">; def err_invalid_binary_digit : Error<"invalid digit '%0' in binary constant">; def err_invalid_octal_digit : Error<"invalid digit '%0' in octal constant">; diff --git a/include/clang/Lex/Lexer.h b/include/clang/Lex/Lexer.h index d36189fccd4142d9bfaf049325e993f1cb70cab8..535baf588f8400b15bd8b532b5bc4a8e8e770ee6 100644 --- a/include/clang/Lex/Lexer.h +++ b/include/clang/Lex/Lexer.h @@ -437,6 +437,11 @@ private: /// void LexTokenInternal(Token &Result); + /// Given that a token begins with the Unicode character \p C, figure out + /// what kind of token it is and dispatch to the appropriate lexing helper + /// function. + void LexUnicode(Token &Result, uint32_t C, const char *CurPtr); + /// FormTokenWithChars - When we lex a token, we have identified a span /// starting at BufferPtr, going to TokEnd that forms the token. 
This method /// takes that range and assigns it to the token as its location and size. In @@ -579,6 +584,21 @@ private: void cutOffLexing() { BufferPtr = BufferEnd; } bool isHexaLiteral(const char *Start, const LangOptions &LangOpts); + + + /// Read a universal character name. + /// + /// \param CurPtr The position in the source buffer after the initial '\'. + /// If the UCN is syntactically well-formed (but not necessarily + /// valid), this parameter will be updated to point to the + /// character after the UCN. + /// \param SlashLoc The position in the source buffer of the '\'. + /// \param Tok The token being formed. Pass \c NULL to suppress diagnostics + /// and handle token formation in the caller. + /// + /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is + /// invalid. + uint32_t tryReadUCN(const char *&CurPtr, const char *SlashLoc, Token *Tok); }; diff --git a/include/clang/Lex/Token.h b/include/clang/Lex/Token.h index 06ff56ea9c942fe284bb2b9150c30a90bbfce936..bcbe9c913be19fda23a46681fb56a61f47ba2e5a 100644 --- a/include/clang/Lex/Token.h +++ b/include/clang/Lex/Token.h @@ -74,9 +74,10 @@ public: StartOfLine = 0x01, // At start of line or only after whitespace. LeadingSpace = 0x02, // Whitespace exists before this token. DisableExpand = 0x04, // This identifier may never be macro expanded. - NeedsCleaning = 0x08, // Contained an escaped newline or trigraph. + NeedsCleaning = 0x08, // Contained an escaped newline or trigraph. LeadingEmptyMacro = 0x10, // Empty macro exists before this token. - HasUDSuffix = 0x20 // This string or character literal has a ud-suffix. + HasUDSuffix = 0x20, // This string or character literal has a ud-suffix. + HasUCN = 0x40 // This identifier contains a UCN. }; tok::TokenKind getKind() const { return (tok::TokenKind)Kind; } @@ -257,6 +258,9 @@ public: /// \brief Return true if this token is a string or character literal which /// has a ud-suffix. bool hasUDSuffix() const { return (Flags & HasUDSuffix) ? 
true : false; } + + /// Returns true if this token contains a universal character name. + bool hasUCN() const { return (Flags & HasUCN) ? true : false; } }; /// \brief Information about the conditional stack (\#if directives) diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp index 2bd95ab1d796f93dab4a388bababec36ad5a8354..e6ffca955436c378a73b93df1d69d289e595e16c 100644 --- a/lib/Lex/Lexer.cpp +++ b/lib/Lex/Lexer.cpp @@ -25,11 +25,13 @@ //===----------------------------------------------------------------------===// #include "clang/Lex/Lexer.h" +#include "clang/Basic/ConvertUTF.h" #include "clang/Basic/SourceManager.h" #include "clang/Lex/CodeCompletionHandler.h" #include "clang/Lex/LexDiagnostic.h" #include "clang/Lex/Preprocessor.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/MemoryBuffer.h" @@ -371,10 +373,12 @@ unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, // NOTE: this has to be checked *before* testing for an IdentifierInfo. if (Tok.is(tok::raw_identifier)) TokStart = Tok.getRawIdentifierData(); - else if (const IdentifierInfo *II = Tok.getIdentifierInfo()) { - // Just return the string from the identifier table, which is very quick. - Buffer = II->getNameStart(); - return II->getLength(); + else if (!Tok.hasUCN()) { + if (const IdentifierInfo *II = Tok.getIdentifierInfo()) { + // Just return the string from the identifier table, which is very quick. + Buffer = II->getNameStart(); + return II->getLength(); + } } // NOTE: this can be checked even after testing for an IdentifierInfo. @@ -1376,7 +1380,6 @@ SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc, /// 2. If this is an escaped newline (potentially with whitespace between /// the backslash and newline), implicitly skip the newline and return /// the char after it. -/// 3. If this is a UCN, return it. FIXME: C++ UCN's? 
/// /// This handles the slow/uncommon case of the getCharAndSize method. Here we /// know that we can accumulate into Size, and that we have already incremented @@ -1509,6 +1512,77 @@ void Lexer::SkipBytes(unsigned Bytes, bool StartOfLine) { IsAtStartOfLine = StartOfLine; } +namespace { + struct UCNCharRange { + uint32_t Lower; + uint32_t Upper; + }; + + // C11 D.1, C++11 [charname.allowed] + // FIXME: C99 and C++03 each have a different set of allowed UCNs. + const UCNCharRange UCNAllowedCharRanges[] = { + // 1 + { 0x00A8, 0x00A8 }, { 0x00AA, 0x00AA }, { 0x00AD, 0x00AD }, + { 0x00AF, 0x00AF }, { 0x00B2, 0x00B5 }, { 0x00B7, 0x00BA }, + { 0x00BC, 0x00BE }, { 0x00C0, 0x00D6 }, { 0x00D8, 0x00F6 }, + { 0x00F8, 0x00FF }, + // 2 + { 0x0100, 0x167F }, { 0x1681, 0x180D }, { 0x180F, 0x1FFF }, + // 3 + { 0x200B, 0x200D }, { 0x202A, 0x202E }, { 0x203F, 0x2040 }, + { 0x2054, 0x2054 }, { 0x2060, 0x206F }, + // 4 + { 0x2070, 0x218F }, { 0x2460, 0x24FF }, { 0x2776, 0x2793 }, + { 0x2C00, 0x2DFF }, { 0x2E80, 0x2FFF }, + // 5 + { 0x3004, 0x3007 }, { 0x3021, 0x302F }, { 0x3031, 0x303F }, + // 6 + { 0x3040, 0xD7FF }, + // 7 + { 0xF900, 0xFD3D }, { 0xFD40, 0xFDCF }, { 0xFDF0, 0xFE44 }, + { 0xFE47, 0xFFFD }, + // 8 + { 0x10000, 0x1FFFD }, { 0x20000, 0x2FFFD }, { 0x30000, 0x3FFFD }, + { 0x40000, 0x4FFFD }, { 0x50000, 0x5FFFD }, { 0x60000, 0x6FFFD }, + { 0x70000, 0x7FFFD }, { 0x80000, 0x8FFFD }, { 0x90000, 0x9FFFD }, + { 0xA0000, 0xAFFFD }, { 0xB0000, 0xBFFFD }, { 0xC0000, 0xCFFFD }, + { 0xD0000, 0xDFFFD }, { 0xE0000, 0xEFFFD } + }; +} + +static bool isAllowedIDChar(uint32_t c) { + unsigned LowPoint = 0; + unsigned HighPoint = llvm::array_lengthof(UCNAllowedCharRanges); + + // Binary search the UCNAllowedCharRanges set. 
+ while (HighPoint != LowPoint) { + unsigned MidPoint = (HighPoint + LowPoint) / 2; + if (c < UCNAllowedCharRanges[MidPoint].Lower) + HighPoint = MidPoint; + else if (c > UCNAllowedCharRanges[MidPoint].Upper) + LowPoint = MidPoint + 1; + else + return true; + } + + return false; +} + +static bool isAllowedInitiallyIDChar(uint32_t c) { + // C11 D.2, C++11 [charname.disallowed] + // FIXME: C99 only forbids "digits", presumably as described in C99 Annex D. + // FIXME: C++03 does not forbid any initial characters. + return !(0x0300 <= c && c <= 0x036F) && + !(0x1DC0 <= c && c <= 0x1DFF) && + !(0x20D0 <= c && c <= 0x20FF) && + !(0xFE20 <= c && c <= 0xFE2F); +} + +static inline bool isASCII(char C) { + return static_cast<signed char>(C) >= 0; +} + + void Lexer::LexIdentifier(Token &Result, const char *CurPtr) { // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$] unsigned Size; @@ -1520,11 +1594,11 @@ void Lexer::LexIdentifier(Token &Result, const char *CurPtr) { // Fast path, no $,\,? in identifier found. '\' might be an escaped newline // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN. - // FIXME: UCNs. // // TODO: Could merge these checks into a CharInfo flag to make the comparison // cheaper - if (C != '\\' && C != '?' && (C != '$' || !LangOpts.DollarIdents)) { + if (isASCII(C) && C != '\\' && C != '?' && + (C != '$' || !LangOpts.DollarIdents)) { FinishIdentifier: const char *IdStart = BufferPtr; FormTokenWithChars(Result, CurPtr, tok::raw_identifier); @@ -1561,8 +1635,38 @@ FinishIdentifier: CurPtr = ConsumeChar(CurPtr, Size, Result); C = getCharAndSize(CurPtr, Size); continue; - } else if (!isIdentifierBody(C)) { // FIXME: UCNs. - // Found end of identifier. 
+ + } else if (C == '\\') { + const char *UCNPtr = CurPtr + Size; + uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/0); + if (CodePoint == 0 || !isAllowedIDChar(CodePoint)) + goto FinishIdentifier; + + Result.setFlag(Token::HasUCN); + if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') || + (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U')) + CurPtr = UCNPtr; + else + while (CurPtr != UCNPtr) + (void)getAndAdvanceChar(CurPtr, Result); + + C = getCharAndSize(CurPtr, Size); + continue; + } else if (!isASCII(C)) { + const char *UnicodePtr = CurPtr; + UTF32 CodePoint; + ConversionResult Result = convertUTF8Sequence((const UTF8 **)&UnicodePtr, + (const UTF8 *)BufferEnd, + &CodePoint, + strictConversion); + if (Result != conversionOK || + !isAllowedIDChar(static_cast<uint32_t>(CodePoint))) + goto FinishIdentifier; + + CurPtr = UnicodePtr; + C = getCharAndSize(CurPtr, Size); + continue; + } else if (!isIdentifierBody(C)) { goto FinishIdentifier; } @@ -1570,7 +1674,7 @@ FinishIdentifier: CurPtr = ConsumeChar(CurPtr, Size, Result); C = getCharAndSize(CurPtr, Size); - while (isIdentifierBody(C)) { // FIXME: UCNs. 
+ while (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); C = getCharAndSize(CurPtr, Size); } @@ -2592,6 +2696,135 @@ bool Lexer::isCodeCompletionPoint(const char *CurPtr) const { return false; } +uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc, + Token *Result) { + assert(LangOpts.CPlusPlus || LangOpts.C99); + + unsigned CharSize; + char Kind = getCharAndSize(StartPtr, CharSize); + + unsigned NumHexDigits; + if (Kind == 'u') + NumHexDigits = 4; + else if (Kind == 'U') + NumHexDigits = 8; + else + return 0; + + const char *CurPtr = StartPtr + CharSize; + const char *KindLoc = &CurPtr[-1]; + + uint32_t CodePoint = 0; + for (unsigned i = 0; i < NumHexDigits; ++i) { + char C = getCharAndSize(CurPtr, CharSize); + + unsigned Value = llvm::hexDigitValue(C); + if (Value == -1U) { + if (Result && !isLexingRawMode()) { + if (i == 0) { + Diag(BufferPtr, diag::warn_ucn_escape_no_digits) + << StringRef(KindLoc, 1); + } else { + // FIXME: if i == 4 and NumHexDigits == 8, suggest a fixit to \u. + Diag(BufferPtr, diag::warn_ucn_escape_incomplete); + } + } + + return 0; + } + + CodePoint <<= 4; + CodePoint += Value; + + CurPtr += CharSize; + } + + if (Result) { + Result->setFlag(Token::HasUCN); + if (CurPtr - StartPtr == NumHexDigits + 2) + StartPtr = CurPtr; + else + while (StartPtr != CurPtr) + (void)getAndAdvanceChar(StartPtr, *Result); + } else { + StartPtr = CurPtr; + } + + // C99 6.4.3p2: A universal character name shall not specify a character whose + // short identifier is less than 00A0 other than 0024 ($), 0040 (@), or + // 0060 (`), nor one in the range D800 through DFFF inclusive.) + // C++11 [lex.charset]p2: If the hexadecimal value for a + // universal-character-name corresponds to a surrogate code point (in the + // range 0xD800-0xDFFF, inclusive), the program is ill-formed. 
Additionally, + // if the hexadecimal value for a universal-character-name outside the + // c-char-sequence, s-char-sequence, or r-char-sequence of a character or + // string literal corresponds to a control character (in either of the + // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the + // basic source character set, the program is ill-formed. + if (CodePoint < 0xA0) { + if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60) + return CodePoint; + + // We don't use isLexingRawMode() here because we need to warn about bad + // UCNs even when skipping preprocessing tokens in a #if block. + if (Result && PP) { + if (CodePoint < 0x20 || CodePoint >= 0x7F) + Diag(BufferPtr, diag::err_ucn_control_character); + else { + char C = static_cast<char>(CodePoint); + Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1); + } + } + + return 0; + + } else if ((!LangOpts.CPlusPlus || LangOpts.CPlusPlus11) && + (CodePoint >= 0xD800 && CodePoint <= 0xDFFF)) { + // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't. + // We don't use isLexingRawMode() here because we need to warn about bad + // UCNs even when skipping preprocessing tokens in a #if block. + if (Result && PP) + Diag(BufferPtr, diag::err_ucn_escape_invalid); + return 0; + } + + return CodePoint; +} + +void Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) { + if (isAllowedIDChar(C) && isAllowedInitiallyIDChar(C)) { + MIOpt.ReadToken(); + return LexIdentifier(Result, CurPtr); + } + + if (!isASCII(*BufferPtr) && !isAllowedIDChar(C)) { + // Non-ASCII characters tend to creep into source code unintentionally. + // Instead of letting the parser complain about the unknown token, + // just drop the character. + // Note that we can /only/ do this when the non-ASCII character is actually + // spelled as Unicode, not written as a UCN. 
The standard requires that + // we not throw away any possible preprocessor tokens, but there's a + // loophole in the mapping of Unicode characters to basic character set + // characters that allows us to map these particular characters to, say, + // whitespace. + if (!isLexingRawMode()) { + CharSourceRange CharRange = + CharSourceRange::getCharRange(getSourceLocation(), + getSourceLocation(CurPtr)); + Diag(BufferPtr, diag::err_non_ascii) + << FixItHint::CreateRemoval(CharRange); + } + + BufferPtr = CurPtr; + return LexTokenInternal(Result); + } + + // Otherwise, we have an explicit UCN or a character that's unlikely to show + // up by accident. + MIOpt.ReadToken(); + FormTokenWithChars(Result, CurPtr, tok::unknown); +} + /// LexTokenInternal - This implements a simple C family lexer. It is an /// extremely performance critical piece of code. This assumes that the buffer @@ -3243,12 +3476,41 @@ LexNextToken: Kind = tok::unknown; break; + // UCNs (C99 6.4.3, C++11 [lex.charset]p2) case '\\': - // FIXME: UCN's. - // FALL THROUGH. - default: + if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) + return LexUnicode(Result, CodePoint, CurPtr); + Kind = tok::unknown; break; + + default: { + if (isASCII(Char)) { + Kind = tok::unknown; + break; + } + + UTF32 CodePoint; + + // We can't just reset CurPtr to BufferPtr because BufferPtr may point to + // an escaped newline. + --CurPtr; + ConversionResult Status = convertUTF8Sequence((const UTF8 **)&CurPtr, + (const UTF8 *)BufferEnd, + &CodePoint, + strictConversion); + if (Status == conversionOK) + return LexUnicode(Result, CodePoint, CurPtr); + + // Non-ASCII characters tend to creep into source code unintentionally. + // Instead of letting the parser complain about the unknown token, + // just warn that we don't have valid UTF-8, then drop the character. 
+ if (!isLexingRawMode()) + Diag(CurPtr, diag::err_invalid_utf8); + + BufferPtr = CurPtr+1; + goto LexNextToken; + } } // Notify MIOpt that we read a non-whitespace/non-comment token. diff --git a/lib/Lex/Preprocessor.cpp b/lib/Lex/Preprocessor.cpp index c01019cf43051a328f4d5693c647ca51e7837219..b933a5fd75098048818f241a177c786cc69f9338 100644 --- a/lib/Lex/Preprocessor.cpp +++ b/lib/Lex/Preprocessor.cpp @@ -27,6 +27,7 @@ #include "clang/Lex/Preprocessor.h" #include "MacroArgs.h" +#include "clang/Basic/ConvertUTF.h" #include "clang/Basic/FileManager.h" #include "clang/Basic/SourceManager.h" #include "clang/Basic/TargetInfo.h" @@ -43,6 +44,8 @@ #include "clang/Lex/ScratchBuffer.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Support/Capacity.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" @@ -396,7 +399,7 @@ StringRef Preprocessor::getSpelling(const Token &Tok, SmallVectorImpl<char> &Buffer, bool *Invalid) const { // NOTE: this has to be checked *before* testing for an IdentifierInfo. - if (Tok.isNot(tok::raw_identifier)) { + if (Tok.isNot(tok::raw_identifier) && !Tok.hasUCN()) { // Try the fast path. if (const IdentifierInfo *II = Tok.getIdentifierInfo()) return II->getName(); @@ -494,6 +497,48 @@ void Preprocessor::EndSourceFile() { // Lexer Event Handling. 
//===----------------------------------------------------------------------===// +static void appendCodePoint(unsigned Codepoint, + llvm::SmallVectorImpl<char> &Str) { + char ResultBuf[4]; + char *ResultPtr = ResultBuf; + bool Res = ConvertCodePointToUTF8(Codepoint, ResultPtr); + (void)Res; + assert(Res && "Unexpected conversion failure"); + Str.append(ResultBuf, ResultPtr); +} + +static void expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) { + for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) { + if (*I != '\\') { + Buf.push_back(*I); + continue; + } + + ++I; // Step past the backslash onto the 'u'/'U' marker. + assert(*I == 'u' || *I == 'U'); + + unsigned NumHexDigits; + if (*I == 'u') + NumHexDigits = 4; + else + NumHexDigits = 8; + + assert(I + NumHexDigits < E); // I is on 'u'/'U'; every digit precedes E. + + uint32_t CodePoint = 0; + for (++I; NumHexDigits != 0; ++I, --NumHexDigits) { + unsigned Value = llvm::hexDigitValue(*I); + assert(Value != -1U); + + CodePoint <<= 4; + CodePoint += Value; + } + + appendCodePoint(CodePoint, Buf); + --I; // Compensate for the outer loop's ++I. + } +} + /// LookUpIdentifierInfo - Given a tok::raw_identifier token, look up the /// identifier information for the token and install it into the token, /// updating the token kind accordingly. @@ -502,15 +547,22 @@ IdentifierInfo *Preprocessor::LookUpIdentifierInfo(Token &Identifier) const { // Look up this token, see if it is a macro, or if it is a language keyword. IdentifierInfo *II; - if (!Identifier.needsCleaning()) { + if (!Identifier.needsCleaning() && !Identifier.hasUCN()) { // No cleaning needed, just use the characters from the lexed buffer. II = getIdentifierInfo(StringRef(Identifier.getRawIdentifierData(), - Identifier.getLength())); + Identifier.getLength())); } else { // Cleaning needed, alloca a buffer, clean into it, then use the buffer. 
SmallString<64> IdentifierBuffer; StringRef CleanedStr = getSpelling(Identifier, IdentifierBuffer); - II = getIdentifierInfo(CleanedStr); + + if (Identifier.hasUCN()) { + SmallString<64> UCNIdentifierBuffer; + expandUCNs(UCNIdentifierBuffer, CleanedStr); + II = getIdentifierInfo(UCNIdentifierBuffer); + } else { + II = getIdentifierInfo(CleanedStr); + } } // Update the token info (identifier info and appropriate token kind). diff --git a/test/CXX/over/over.oper/over.literal/p8.cpp b/test/CXX/over/over.oper/over.literal/p8.cpp index 6f636104e45d820637b74c4b0a33de6309c8d080..70a184372cf59991eb8b623cdffe4f60297df89e 100644 --- a/test/CXX/over/over.oper/over.literal/p8.cpp +++ b/test/CXX/over/over.oper/over.literal/p8.cpp @@ -7,8 +7,7 @@ namespace std { void operator "" _km(long double); // ok string operator "" _i18n(const char*, std::size_t); // ok -// FIXME: This should be accepted once we support UCNs -template<char...> int operator "" \u03C0(); // ok, UCN for lowercase pi // expected-error {{expected identifier}} +template<char...> int operator "" \u03C0(); // ok, UCN for lowercase pi // expected-warning {{reserved}} float operator ""E(const char *); // expected-error {{invalid suffix on literal}} expected-warning {{reserved}} float operator " " B(const char *); // expected-error {{must be '""'}} expected-warning {{reserved}} string operator "" 5X(const char *, std::size_t); // expected-error {{expected identifier}} diff --git a/test/CodeGen/ucn-identifiers.c b/test/CodeGen/ucn-identifiers.c new file mode 100644 index 0000000000000000000000000000000000000000..56e3aa5ad84eea72d454b8b7ab94273286528c38 --- /dev/null +++ b/test/CodeGen/ucn-identifiers.c @@ -0,0 +1,14 @@ +// RUN: %clang_cc1 %s -emit-llvm -o /dev/null +// RUN: %clang_cc1 %s -emit-llvm -o /dev/null -x c++ +// This file contains UTF-8; please do not fix! 
+ + +extern void \u00FCber(int); +extern void \U000000FCber(int); // redeclaration, no warning + +void goodCalls() { + \u00FCber(0); + \u00fcber(1); + über(2); + \U000000FCber(3); +} diff --git a/test/FixIt/fixit-unicode.c b/test/FixIt/fixit-unicode.c index 2af5e08faa41f3cc0e3a4597b45c6b3e7222ffd7..c45ba0663f43f3ba322fda7aa7a5dad1a98fb24c 100644 --- a/test/FixIt/fixit-unicode.c +++ b/test/FixIt/fixit-unicode.c @@ -8,13 +8,15 @@ struct Foo { // PR13312 void test1() { struct Foo foo; - (&foo)☃>bar = 42; + foo.bar = 42☃ +// CHECK: error: non-ASCII characters are not allowed outside of literals and identifiers +// CHECK: {{^ \^}} // CHECK: error: expected ';' after expression // Make sure we emit the fixit right in front of the snowman. -// CHECK: {{^ \^}} -// CHECK: {{^ ;}} +// CHECK: {{^ \^}} +// CHECK: {{^ ;}} -// CHECK-MACHINE: fix-it:"{{.*}}fixit-unicode.c":{11:9-11:9}:";" +// CHECK-MACHINE: fix-it:"{{.*}}fixit-unicode.c":{[[@LINE-8]]:15-[[@LINE-8]]:15}:";" } @@ -29,5 +31,5 @@ void test2() { // because different systems will render the delta differently (either as a // character, or as <U+2206>.) The fixit should line up with the %d regardless. -// CHECK-MACHINE: fix-it:"{{.*}}fixit-unicode.c":{23:16-23:18}:"%ld" +// CHECK-MACHINE: fix-it:"{{.*}}fixit-unicode.c":{[[@LINE-9]]:16-[[@LINE-9]]:18}:"%ld" } diff --git a/test/Lexer/utf8-invalid.c b/test/Lexer/utf8-invalid.c new file mode 100644 index 0000000000000000000000000000000000000000..c4dd318e786acb5e993fe250a4df5ed150bc4218 --- /dev/null +++ b/test/Lexer/utf8-invalid.c @@ -0,0 +1,6 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s + +// Note: this file contains invalid UTF-8 before the variable name in the +// next line. Please do not fix! 
+ +extern int ‚x; // expected-error{{source file is not valid UTF-8}} diff --git a/test/Preprocessor/ucn-pp-identifier.c b/test/Preprocessor/ucn-pp-identifier.c new file mode 100644 index 0000000000000000000000000000000000000000..f4afa91ed3a0cdb58e4c51f11d8575c872944ef4 --- /dev/null +++ b/test/Preprocessor/ucn-pp-identifier.c @@ -0,0 +1,97 @@ +// RUN: %clang_cc1 %s -fsyntax-only -std=c99 -pedantic -verify -Wundef +// RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify -Wundef + +#define \u00FC +#define a\u00FD() 0 +#ifndef \u00FC +#error "This should never happen" +#endif + +#if a\u00FD() +#error "This should never happen" +#endif + +#if a\U000000FD() +#error "This should never happen" +#endif + +#if \uarecool // expected-warning{{incomplete universal character name; treating as '\' followed by identifier}} expected-error {{invalid token at start of a preprocessor expression}} +#endif +#if \uwerecool // expected-warning{{\u used with no following hex digits; treating as '\' followed by identifier}} expected-error {{invalid token at start of a preprocessor expression}} +#endif +#if \U0001000 // expected-warning{{incomplete universal character name; treating as '\' followed by identifier}} expected-error {{invalid token at start of a preprocessor expression}} +#endif + +// Make sure we reject disallowed UCNs +#define \ufffe // expected-error {{macro names must be identifiers}} +#define \U10000000 // expected-error {{macro names must be identifiers}} +#define \u0061 // expected-error {{character 'a' cannot be specified by a universal character name}} expected-error {{macro names must be identifiers}} + +// FIXME: Not clear what our behavior should be here; \u0024 is "$". 
+#define a\u0024 // expected-warning {{whitespace}} + +#if \u0110 // expected-warning {{is not defined, evaluates to 0}} +#endif + + +#define \u0110 1 / 0 +#if \u0110 // expected-error {{division by zero in preprocessor expression}} +#endif + +#define STRINGIZE(X) # X + +extern int check_size[sizeof(STRINGIZE(\u0112)) == 3 ? 1 : -1]; + +// Check that we still diagnose disallowed UCNs in #if 0 blocks. +// C99 5.1.1.2p1 and C++11 [lex.phases]p1 dictate that preprocessor tokens are +// formed before directives are parsed. +// expected-error@+4 {{character 'a' cannot be specified by a universal character name}} +#if 0 +#define \ufffe // okay +#define \U10000000 // okay +#define \u0061 // error, but -verify only looks at comments outside #if 0 +#endif + + +// A UCN formed by token pasting is undefined in both C99 and C++. +// Right now we don't do anything special, which causes us to coincidentally +// accept the first case below but reject the second two. +#define PASTE(A, B) A ## B +extern int PASTE(\, u00FD); +extern int PASTE(\u, 00FD); // expected-warning{{\u used with no following hex digits}} +extern int PASTE(\u0, 0FD); // expected-warning{{incomplete universal character name}} +#ifdef __cplusplus +// expected-error@-3 {{expected unqualified-id}} +// expected-error@-3 {{expected unqualified-id}} +#else +// expected-error@-6 {{expected identifier}} +// expected-error@-6 {{expected identifier}} +#endif + + +// A UCN produced by line splicing is valid in C99 but undefined in C++. 
+// Since undefined behavior can do anything including working as intended, +// we just accept it in C++ as well. +#define newline_1_\u00F\ +C 1 +#define newline_2_\u00\ +F\ +C 1 +#define newline_3_\u\ +00\ +FC 1 +#define newline_4_\\ +u00FC 1 +#define newline_5_\\ +u\ +\ +0\ +0\ +F\ +C 1 + +#if (newline_1_\u00FC && newline_2_\u00FC && newline_3_\u00FC && \ + newline_4_\u00FC && newline_5_\u00FC) +#else +#error "Line splicing failed to produce UCNs" +#endif diff --git a/test/Sema/ucn-identifiers.c b/test/Sema/ucn-identifiers.c new file mode 100644 index 0000000000000000000000000000000000000000..6b2636587af661b2b60c9ac1bc746351c00a9d51 --- /dev/null +++ b/test/Sema/ucn-identifiers.c @@ -0,0 +1,35 @@ +// RUN: %clang_cc1 %s -verify -fsyntax-only -pedantic +// RUN: %clang_cc1 %s -verify -fsyntax-only -x c++ -pedantic + +// This file contains UTF-8; please do not fix! + + +extern void \u00FCber(int); +extern void \U000000FCber(int); // redeclaration, no warning +#ifdef __cplusplus +// expected-note@-2 + {{candidate function not viable}} +#else +// expected-note@-4 + {{declared here}} +#endif + +void goodCalls() { + \u00FCber(0); + \u00fcber(1); + über(2); + \U000000FCber(3); +} + +void badCalls() { + \u00FCber(0.5); // expected-warning{{implicit conversion from 'double' to 'int'}} + \u00fcber = 0; // expected-error{{non-object type 'void (int)' is not assignable}} + + über(1, 2); + \U000000FCber(); +#ifdef __cplusplus + // expected-error@-3 {{no matching function}} + // expected-error@-3 {{no matching function}} +#else + // expected-error@-6 {{too many arguments to function call, expected 1, have 2}} + // expected-error@-6 {{too few arguments to function call, expected 1, have 0}} +#endif +}