123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377 |
- #pragma once
- #ifdef __GNUC__
- #pragma GCC diagnostic push
- #pragma GCC diagnostic ignored "-Wunused-parameter"
- #endif
- //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
- //
- // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- // See https://llvm.org/LICENSE.txt for license information.
- // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- //
- //===----------------------------------------------------------------------===//
- //
- // This file defines the lexer for structured comments and the supporting
- // token class.
- //
- //===----------------------------------------------------------------------===//
- #ifndef LLVM_CLANG_AST_COMMENTLEXER_H
- #define LLVM_CLANG_AST_COMMENTLEXER_H
- #include "clang/Basic/Diagnostic.h"
- #include "clang/Basic/SourceManager.h"
- #include "llvm/ADT/SmallString.h"
- #include "llvm/ADT/StringRef.h"
- #include "llvm/Support/Allocator.h"
- #include "llvm/Support/raw_ostream.h"
- namespace clang {
- namespace comments {
- class Lexer;
- class TextTokenRetokenizer;
- struct CommandInfo;
- class CommandTraits;
namespace tok {
/// Kinds of comment tokens (the value stored in a Token's kind field).
enum TokenKind {
  eof,
  newline,
  text,

  /// Command that does not have an ID.
  unknown_command,
  /// Command with an ID, that used backslash marker.
  backslash_command,
  /// Command with an ID, that used 'at' marker.
  at_command,

  verbatim_block_begin,
  verbatim_block_line,
  verbatim_block_end,
  verbatim_line_name,
  verbatim_line_text,

  /// <tag
  html_start_tag,
  /// attr
  html_ident,
  /// =
  html_equals,
  /// "blah\"blah" or 'blah\'blah'
  html_quoted_string,
  /// >
  html_greater,
  /// />
  html_slash_greater,
  /// </tag
  html_end_tag
};
} // end namespace tok
- /// Comment token.
- class Token {
- friend class Lexer;
- friend class TextTokenRetokenizer;
- /// The location of the token.
- SourceLocation Loc;
- /// The actual kind of the token.
- tok::TokenKind Kind;
- /// Integer value associated with a token.
- ///
- /// If the token is a known command, contains command ID and TextPtr is
- /// unused (command spelling can be found with CommandTraits). Otherwise,
- /// contains the length of the string that starts at TextPtr.
- unsigned IntVal;
- /// Length of the token spelling in comment. Can be 0 for synthenized
- /// tokens.
- unsigned Length;
- /// Contains text value associated with a token.
- const char *TextPtr;
- public:
- SourceLocation getLocation() const LLVM_READONLY { return Loc; }
- void setLocation(SourceLocation SL) { Loc = SL; }
- SourceLocation getEndLocation() const LLVM_READONLY {
- if (Length == 0 || Length == 1)
- return Loc;
- return Loc.getLocWithOffset(Length - 1);
- }
- tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
- void setKind(tok::TokenKind K) { Kind = K; }
- bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
- bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
- unsigned getLength() const LLVM_READONLY { return Length; }
- void setLength(unsigned L) { Length = L; }
- StringRef getText() const LLVM_READONLY {
- assert(is(tok::text));
- return StringRef(TextPtr, IntVal);
- }
- void setText(StringRef Text) {
- assert(is(tok::text));
- TextPtr = Text.data();
- IntVal = Text.size();
- }
- StringRef getUnknownCommandName() const LLVM_READONLY {
- assert(is(tok::unknown_command));
- return StringRef(TextPtr, IntVal);
- }
- void setUnknownCommandName(StringRef Name) {
- assert(is(tok::unknown_command));
- TextPtr = Name.data();
- IntVal = Name.size();
- }
- unsigned getCommandID() const LLVM_READONLY {
- assert(is(tok::backslash_command) || is(tok::at_command));
- return IntVal;
- }
- void setCommandID(unsigned ID) {
- assert(is(tok::backslash_command) || is(tok::at_command));
- IntVal = ID;
- }
- unsigned getVerbatimBlockID() const LLVM_READONLY {
- assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
- return IntVal;
- }
- void setVerbatimBlockID(unsigned ID) {
- assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
- IntVal = ID;
- }
- StringRef getVerbatimBlockText() const LLVM_READONLY {
- assert(is(tok::verbatim_block_line));
- return StringRef(TextPtr, IntVal);
- }
- void setVerbatimBlockText(StringRef Text) {
- assert(is(tok::verbatim_block_line));
- TextPtr = Text.data();
- IntVal = Text.size();
- }
- unsigned getVerbatimLineID() const LLVM_READONLY {
- assert(is(tok::verbatim_line_name));
- return IntVal;
- }
- void setVerbatimLineID(unsigned ID) {
- assert(is(tok::verbatim_line_name));
- IntVal = ID;
- }
- StringRef getVerbatimLineText() const LLVM_READONLY {
- assert(is(tok::verbatim_line_text));
- return StringRef(TextPtr, IntVal);
- }
- void setVerbatimLineText(StringRef Text) {
- assert(is(tok::verbatim_line_text));
- TextPtr = Text.data();
- IntVal = Text.size();
- }
- StringRef getHTMLTagStartName() const LLVM_READONLY {
- assert(is(tok::html_start_tag));
- return StringRef(TextPtr, IntVal);
- }
- void setHTMLTagStartName(StringRef Name) {
- assert(is(tok::html_start_tag));
- TextPtr = Name.data();
- IntVal = Name.size();
- }
- StringRef getHTMLIdent() const LLVM_READONLY {
- assert(is(tok::html_ident));
- return StringRef(TextPtr, IntVal);
- }
- void setHTMLIdent(StringRef Name) {
- assert(is(tok::html_ident));
- TextPtr = Name.data();
- IntVal = Name.size();
- }
- StringRef getHTMLQuotedString() const LLVM_READONLY {
- assert(is(tok::html_quoted_string));
- return StringRef(TextPtr, IntVal);
- }
- void setHTMLQuotedString(StringRef Str) {
- assert(is(tok::html_quoted_string));
- TextPtr = Str.data();
- IntVal = Str.size();
- }
- StringRef getHTMLTagEndName() const LLVM_READONLY {
- assert(is(tok::html_end_tag));
- return StringRef(TextPtr, IntVal);
- }
- void setHTMLTagEndName(StringRef Name) {
- assert(is(tok::html_end_tag));
- TextPtr = Name.data();
- IntVal = Name.size();
- }
- void dump(const Lexer &L, const SourceManager &SM) const;
- };
- /// Comment lexer.
- class Lexer {
- private:
- Lexer(const Lexer &) = delete;
- void operator=(const Lexer &) = delete;
- /// Allocator for strings that are semantic values of tokens and have to be
- /// computed (for example, resolved decimal character references).
- llvm::BumpPtrAllocator &Allocator;
- DiagnosticsEngine &Diags;
- const CommandTraits &Traits;
- const char *const BufferStart;
- const char *const BufferEnd;
- const char *BufferPtr;
- /// One past end pointer for the current comment. For BCPL comments points
- /// to newline or BufferEnd, for C comments points to star in '*/'.
- const char *CommentEnd;
- SourceLocation FileLoc;
- /// If true, the commands, html tags, etc will be parsed and reported as
- /// separate tokens inside the comment body. If false, the comment text will
- /// be parsed into text and newline tokens.
- bool ParseCommands;
- enum LexerCommentState : uint8_t {
- LCS_BeforeComment,
- LCS_InsideBCPLComment,
- LCS_InsideCComment,
- LCS_BetweenComments
- };
- /// Low-level lexer state, track if we are inside or outside of comment.
- LexerCommentState CommentState;
- enum LexerState : uint8_t {
- /// Lexing normal comment text
- LS_Normal,
- /// Finished lexing verbatim block beginning command, will lex first body
- /// line.
- LS_VerbatimBlockFirstLine,
- /// Lexing verbatim block body line-by-line, skipping line-starting
- /// decorations.
- LS_VerbatimBlockBody,
- /// Finished lexing verbatim line beginning command, will lex text (one
- /// line).
- LS_VerbatimLineText,
- /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
- LS_HTMLStartTag,
- /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
- LS_HTMLEndTag
- };
- /// Current lexing mode.
- LexerState State;
- /// If State is LS_VerbatimBlock, contains the name of verbatim end
- /// command, including command marker.
- SmallString<16> VerbatimBlockEndCommandName;
- /// Given a character reference name (e.g., "lt"), return the character that
- /// it stands for (e.g., "<").
- StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
- /// Given a Unicode codepoint as base-10 integer, return the character.
- StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
- /// Given a Unicode codepoint as base-16 integer, return the character.
- StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
- void formTokenWithChars(Token &Result, const char *TokEnd,
- tok::TokenKind Kind);
- void formTextToken(Token &Result, const char *TokEnd) {
- StringRef Text(BufferPtr, TokEnd - BufferPtr);
- formTokenWithChars(Result, TokEnd, tok::text);
- Result.setText(Text);
- }
- SourceLocation getSourceLocation(const char *Loc) const {
- assert(Loc >= BufferStart && Loc <= BufferEnd &&
- "Location out of range for this buffer!");
- const unsigned CharNo = Loc - BufferStart;
- return FileLoc.getLocWithOffset(CharNo);
- }
- DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
- return Diags.Report(Loc, DiagID);
- }
- /// Eat string matching regexp \code \s*\* \endcode.
- void skipLineStartingDecorations();
- /// Skip over pure text.
- const char *skipTextToken();
- /// Lex comment text, including commands if ParseCommands is set to true.
- void lexCommentText(Token &T);
- void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
- const CommandInfo *Info);
- void lexVerbatimBlockFirstLine(Token &T);
- void lexVerbatimBlockBody(Token &T);
- void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
- const CommandInfo *Info);
- void lexVerbatimLineText(Token &T);
- void lexHTMLCharacterReference(Token &T);
- void setupAndLexHTMLStartTag(Token &T);
- void lexHTMLStartTag(Token &T);
- void setupAndLexHTMLEndTag(Token &T);
- void lexHTMLEndTag(Token &T);
- public:
- Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
- const CommandTraits &Traits, SourceLocation FileLoc,
- const char *BufferStart, const char *BufferEnd,
- bool ParseCommands = true);
- void lex(Token &T);
- StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const;
- };
- } // end namespace comments
- } // end namespace clang
- #endif
- #ifdef __GNUC__
- #pragma GCC diagnostic pop
- #endif
|