//===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===---------------------------------------------------------------------===// // // This file implements an interface defined in ResourceScriptToken.h. // In particular, it defines an .rc script tokenizer. // //===---------------------------------------------------------------------===// #include "ResourceScriptToken.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/raw_ostream.h" #include #include #include #include #include using namespace llvm; using Kind = RCToken::Kind; // Checks if Representation is a correct description of an RC integer. // It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+), // or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L' // character (that is the difference between our representation and // StringRef's one). If Representation is correct, 'true' is returned and // the return value is put back in Num. static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) { size_t Length = Representation.size(); if (Length == 0) return false; // Strip the last 'L' if unnecessary. if (std::toupper(Representation.back()) == 'L') Representation = Representation.drop_back(1); return !Representation.getAsInteger(0, Num); } RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value) : TokenKind(RCTokenKind), TokenValue(Value) {} uint32_t RCToken::intValue() const { assert(TokenKind == Kind::Int); // We assume that the token already is a correct integer (checked by // rcGetAsInteger). uint32_t Result; bool IsSuccess = rcGetAsInteger(TokenValue, Result); assert(IsSuccess); (void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on. return Result; } bool RCToken::isLongInt() const { return TokenKind == Kind::Int && std::toupper(TokenValue.back()) == 'L'; } StringRef RCToken::value() const { return TokenValue; } Kind RCToken::kind() const { return TokenKind; } bool RCToken::isBinaryOp() const { switch (TokenKind) { case Kind::Plus: case Kind::Minus: case Kind::Pipe: case Kind::Amp: return true; default: return false; } } static Error getStringError(const Twine &message) { return make_error("Error parsing file: " + message, inconvertibleErrorCode()); } namespace { class Tokenizer { public: Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()), Pos(0) {} Expected> run(); private: // All 'advancing' methods return boolean values; if they're equal to false, // the stream has ended or failed. bool advance(size_t Amount = 1); bool skipWhitespaces(); // Consumes a token. If any problem occurred, a non-empty Error is returned. Error consumeToken(const Kind TokenKind); // Check if tokenizer is about to read FollowingChars. bool willNowRead(StringRef FollowingChars) const; // Check if tokenizer can start reading an identifier at current position. // The original tool did non specify the rules to determine what is a correct // identifier. We assume they should follow the C convention: // [a-zA-Z_][a-zA-Z0-9_]*. bool canStartIdentifier() const; // Check if tokenizer can continue reading an identifier. bool canContinueIdentifier() const; // Check if tokenizer can start reading an integer. // A correct integer always starts with a 0-9 digit, // can contain characters 0-9A-Fa-f (digits), // Ll (marking the integer is 32-bit), Xx (marking the representation // is hexadecimal). As some kind of separator should come after the // integer, we can consume the integer until a non-alphanumeric // character. bool canStartInt() const; bool canContinueInt() const; bool canStartString() const; // Check if tokenizer can start reading a single line comment (e.g. a comment // that begins with '//') bool canStartLineComment() const; // Check if tokenizer can start or finish reading a block comment (e.g. a // comment that begins with '/*' and ends with '*/') bool canStartBlockComment() const; // Throw away all remaining characters on the current line. void skipCurrentLine(); bool streamEof() const; // Classify the token that is about to be read from the current position. Kind classifyCurrentToken() const; // Process the Kind::Identifier token - check if it is // an identifier describing a block start or end. void processIdentifier(RCToken &token) const; StringRef Data; size_t DataLength, Pos; }; void Tokenizer::skipCurrentLine() { Pos = Data.find_first_of("\r\n", Pos); Pos = Data.find_first_not_of("\r\n", Pos); if (Pos == StringRef::npos) Pos = DataLength; } Expected> Tokenizer::run() { Pos = 0; std::vector Result; // Consume an optional UTF-8 Byte Order Mark. if (willNowRead("\xef\xbb\xbf")) advance(3); while (!streamEof()) { if (!skipWhitespaces()) break; Kind TokenKind = classifyCurrentToken(); if (TokenKind == Kind::Invalid) return getStringError("Invalid token found at position " + Twine(Pos)); const size_t TokenStart = Pos; if (Error TokenError = consumeToken(TokenKind)) return std::move(TokenError); // Comments are just deleted, don't bother saving them. if (TokenKind == Kind::LineComment || TokenKind == Kind::StartComment) continue; RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart)); if (TokenKind == Kind::Identifier) { processIdentifier(Token); } else if (TokenKind == Kind::Int) { uint32_t TokenInt; if (!rcGetAsInteger(Token.value(), TokenInt)) { // The integer has incorrect format or cannot be represented in // a 32-bit integer. return getStringError("Integer invalid or too large: " + Token.value().str()); } } Result.push_back(Token); } return Result; } bool Tokenizer::advance(size_t Amount) { Pos += Amount; return !streamEof(); } bool Tokenizer::skipWhitespaces() { while (!streamEof() && isSpace(Data[Pos])) advance(); return !streamEof(); } Error Tokenizer::consumeToken(const Kind TokenKind) { switch (TokenKind) { // One-character token consumption. #define TOKEN(Name) #define SHORT_TOKEN(Name, Ch) case Kind::Name: #include "ResourceScriptTokenList.def" advance(); return Error::success(); case Kind::LineComment: advance(2); skipCurrentLine(); return Error::success(); case Kind::StartComment: { advance(2); auto EndPos = Data.find("*/", Pos); if (EndPos == StringRef::npos) return getStringError( "Unclosed multi-line comment beginning at position " + Twine(Pos)); advance(EndPos - Pos); advance(2); return Error::success(); } case Kind::Identifier: while (!streamEof() && canContinueIdentifier()) advance(); return Error::success(); case Kind::Int: while (!streamEof() && canContinueInt()) advance(); return Error::success(); case Kind::String: // Consume the preceding 'L', if there is any. if (std::toupper(Data[Pos]) == 'L') advance(); // Consume the double-quote. advance(); // Consume the characters until the end of the file, line or string. while (true) { if (streamEof()) { return getStringError("Unterminated string literal."); } else if (Data[Pos] == '"') { // Consume the ending double-quote. advance(); // However, if another '"' follows this double-quote, the string didn't // end and we just included '"' into the string. if (!willNowRead("\"")) return Error::success(); } else if (Data[Pos] == '\n') { return getStringError("String literal not terminated in the line."); } advance(); } case Kind::Invalid: assert(false && "Cannot consume an invalid token."); } llvm_unreachable("Unknown RCToken::Kind"); } bool Tokenizer::willNowRead(StringRef FollowingChars) const { return Data.drop_front(Pos).startswith(FollowingChars); } bool Tokenizer::canStartIdentifier() const { assert(!streamEof()); const char CurChar = Data[Pos]; return std::isalpha(CurChar) || CurChar == '_' || CurChar == '.'; } bool Tokenizer::canContinueIdentifier() const { assert(!streamEof()); const char CurChar = Data[Pos]; return std::isalnum(CurChar) || CurChar == '_' || CurChar == '.' || CurChar == '/' || CurChar == '\\'; } bool Tokenizer::canStartInt() const { assert(!streamEof()); return std::isdigit(Data[Pos]); } bool Tokenizer::canStartBlockComment() const { assert(!streamEof()); return Data.drop_front(Pos).startswith("/*"); } bool Tokenizer::canStartLineComment() const { assert(!streamEof()); return Data.drop_front(Pos).startswith("//"); } bool Tokenizer::canContinueInt() const { assert(!streamEof()); return std::isalnum(Data[Pos]); } bool Tokenizer::canStartString() const { return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\""); } bool Tokenizer::streamEof() const { return Pos == DataLength; } Kind Tokenizer::classifyCurrentToken() const { if (canStartBlockComment()) return Kind::StartComment; if (canStartLineComment()) return Kind::LineComment; if (canStartInt()) return Kind::Int; if (canStartString()) return Kind::String; // BEGIN and END are at this point of lexing recognized as identifiers. if (canStartIdentifier()) return Kind::Identifier; const char CurChar = Data[Pos]; switch (CurChar) { // One-character token classification. #define TOKEN(Name) #define SHORT_TOKEN(Name, Ch) \ case Ch: \ return Kind::Name; #include "ResourceScriptTokenList.def" default: return Kind::Invalid; } } void Tokenizer::processIdentifier(RCToken &Token) const { assert(Token.kind() == Kind::Identifier); StringRef Name = Token.value(); if (Name.equals_lower("begin")) Token = RCToken(Kind::BlockBegin, Name); else if (Name.equals_lower("end")) Token = RCToken(Kind::BlockEnd, Name); } } // anonymous namespace namespace llvm { Expected> tokenizeRC(StringRef Input) { return Tokenizer(Input).run(); } } // namespace llvm