You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
249 lines
9.5 KiB
249 lines
9.5 KiB
/** \file
|
|
* Base interface for any ANTLR3 lexer.
|
|
*
|
|
* An ANLTR3 lexer builds from two sets of components:
|
|
*
|
|
* - The runtime components that provide common functionality such as
|
|
* traversing character streams, building tokens for output and so on.
|
|
* - The generated rules and struutre of the actual lexer, which call upon the
|
|
* runtime components.
|
|
*
|
|
* A lexer class contains a character input stream, a base recognizer interface
|
|
* (which it will normally implement) and a token source interface (which it also
|
|
* implements. The Tokensource interface is called by a token consumer (such as
|
|
* a parser, but in theory it can be anything that wants a set of abstract
|
|
* tokens in place of a raw character stream.
|
|
*
|
|
* So then, we set up a lexer in a sequence akin to:
|
|
*
|
|
* - Create a character stream (something which implements ANTLR3_INPUT_STREAM)
|
|
* and initialize it.
|
|
* - Create a lexer interface and tell it where it its input stream is.
|
|
* This will cause the creation of a base recognizer class, which it will
|
|
* override with its own implementations of some methods. The lexer creator
|
|
* can also then in turn override anything it likes.
|
|
* - The lexer token source interface is then passed to some interface that
|
|
* knows how to use it, byte calling for a next token.
|
|
* - When a next token is called, let ze lexing begin.
|
|
*
|
|
*/
|
|
#ifndef _ANTLR3_LEXER_HPP
|
|
#define _ANTLR3_LEXER_HPP
|
|
|
|
// [The "BSD licence"]
|
|
// Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB
|
|
|
|
//
|
|
// All rights reserved.
|
|
//
|
|
// Redistribution and use in source and binary forms, with or without
|
|
// modification, are permitted provided that the following conditions
|
|
// are met:
|
|
// 1. Redistributions of source code must retain the above copyright
|
|
// notice, this list of conditions and the following disclaimer.
|
|
// 2. Redistributions in binary form must reproduce the above copyright
|
|
// notice, this list of conditions and the following disclaimer in the
|
|
// documentation and/or other materials provided with the distribution.
|
|
// 3. The name of the author may not be used to endorse or promote products
|
|
// derived from this software without specific prior written permission.
|
|
//
|
|
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
|
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
|
// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
|
// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
|
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
/* Definitions
|
|
*/
|
|
#include "antlr3defs.hpp"
|
|
|
|
ANTLR_BEGIN_NAMESPACE()
|
|
|
|
static const ANTLR_UINT32 ANTLR_STRING_TERMINATOR = 0xFFFFFFFF;
|
|
|
|
template<class ImplTraits>
|
|
class Lexer : public ImplTraits::template RecognizerType< typename ImplTraits::InputStreamType >,
|
|
public ImplTraits::TokenSourceType
|
|
{
|
|
public:
|
|
typedef typename ImplTraits::AllocPolicyType AllocPolicyType;
|
|
typedef typename ImplTraits::InputStreamType InputStreamType;
|
|
typedef InputStreamType StreamType;
|
|
typedef typename InputStreamType::IntStreamType IntStreamType;
|
|
typedef typename ImplTraits::CommonTokenType CommonTokenType;
|
|
typedef typename ImplTraits::StreamDataType TokenType;
|
|
typedef typename ImplTraits::StringType StringType;
|
|
typedef typename ImplTraits::StringStreamType StringStreamType;
|
|
typedef typename ImplTraits::template RecognizerType< InputStreamType > RecognizerType;
|
|
typedef typename RecognizerType::RecognizerSharedStateType RecognizerSharedStateType;
|
|
typedef typename ImplTraits::template ExceptionBaseType<InputStreamType> ExceptionBaseType;
|
|
typedef typename ImplTraits::BitsetListType BitsetListType;
|
|
typedef typename ImplTraits::TokenSourceType TokenSourceType;
|
|
|
|
typedef typename RecognizerSharedStateType::RuleMemoType RuleMemoType;
|
|
typedef typename RecognizerType::DebugEventListenerType DebuggerType;
|
|
|
|
private:
|
|
/** A pointer to the character stream whence this lexer is receiving
|
|
* characters.
|
|
* TODO: I may come back to this and implement charstream outside
|
|
* the input stream as per the java implementation.
|
|
*/
|
|
InputStreamType* m_input;
|
|
|
|
public:
|
|
Lexer(ANTLR_UINT32 sizeHint, RecognizerSharedStateType* state);
|
|
Lexer(ANTLR_UINT32 sizeHint, InputStreamType* input, RecognizerSharedStateType* state);
|
|
|
|
InputStreamType* get_input() const;
|
|
IntStreamType* get_istream() const;
|
|
RecognizerType* get_rec();
|
|
const RecognizerType* get_rec() const;
|
|
TokenSourceType* get_tokSource();
|
|
|
|
//functions used in .stg file
|
|
const RecognizerType* get_recognizer() const;
|
|
RecognizerSharedStateType* get_lexstate() const;
|
|
void set_lexstate( RecognizerSharedStateType* lexstate );
|
|
const TokenSourceType* get_tokSource() const;
|
|
CommonTokenType* get_ltoken() const;
|
|
void set_ltoken( const CommonTokenType* ltoken );
|
|
bool hasFailed() const;
|
|
ANTLR_INT32 get_backtracking() const;
|
|
void inc_backtracking();
|
|
void dec_backtracking();
|
|
bool get_failedflag() const;
|
|
void set_failedflag( bool failed );
|
|
InputStreamType* get_strstream() const;
|
|
ANTLR_MARKER index() const;
|
|
void seek(ANTLR_MARKER index);
|
|
const CommonTokenType* EOF_Token() const;
|
|
bool hasException() const;
|
|
ExceptionBaseType* get_exception() const;
|
|
void constructEx();
|
|
void lrecover();
|
|
ANTLR_MARKER mark();
|
|
void rewind(ANTLR_MARKER marker);
|
|
void rewindLast();
|
|
void setText( const StringType& text );
|
|
void skip();
|
|
RuleMemoType* getRuleMemo() const;
|
|
DebuggerType* get_debugger() const;
|
|
void setRuleMemo(RuleMemoType* rulememo);
|
|
ANTLR_UINT32 LA(ANTLR_INT32 i);
|
|
void consume();
|
|
void memoize(ANTLR_MARKER ruleIndex, ANTLR_MARKER ruleParseStart);
|
|
bool haveParsedRule(ANTLR_MARKER ruleIndex);
|
|
|
|
/** Pointer to a function that sets the charstream source for the lexer and
|
|
* causes it to be reset.
|
|
*/
|
|
void setCharStream(InputStreamType* input);
|
|
|
|
/*!
|
|
* \brief
|
|
* Change to a new input stream, remembering the old one.
|
|
*
|
|
* \param lexer
|
|
* Pointer to the lexer instance to switch input streams for.
|
|
*
|
|
* \param input
|
|
* New input stream to install as the current one.
|
|
*
|
|
* Switches the current character input stream to
|
|
* a new one, saving the old one, which we will revert to at the end of this
|
|
* new one.
|
|
*/
|
|
void pushCharStream(InputStreamType* input);
|
|
|
|
/*!
|
|
* \brief
|
|
* Stops using the current input stream and reverts to any prior
|
|
* input stream on the stack.
|
|
*
|
|
* \param lexer
|
|
* Description of parameter lexer.
|
|
*
|
|
* Pointer to a function that abandons the current input stream, whether it
|
|
* is empty or not and reverts to the previous stacked input stream.
|
|
*
|
|
* \remark
|
|
* The function fails silently if there are no prior input streams.
|
|
*/
|
|
void popCharStream();
|
|
|
|
/** Function that emits (a copy of ) the supplied token as the next token in
|
|
* the stream.
|
|
*/
|
|
void emit(const CommonTokenType* token);
|
|
|
|
/** Pointer to a function that constructs a new token from the lexer stored information
|
|
*/
|
|
CommonTokenType* emit();
|
|
|
|
/** Pointer to a function that attempts to match and consume the specified string from the input
|
|
* stream. Note that strings muse be passed as terminated arrays of ANTLR3_UCHAR. Strings are terminated
|
|
* with 0xFFFFFFFF, which is an invalid UTF32 character
|
|
*/
|
|
bool matchs(ANTLR_UCHAR* string);
|
|
|
|
/** Pointer to a function that matches and consumes the specified character from the input stream.
|
|
* The input stream is required to provide characters via LA() as UTF32 characters. The default lexer
|
|
* implementation is source encoding agnostic and so input streams do not generally need to
|
|
* override the default implmentation.
|
|
*/
|
|
bool matchc(ANTLR_UCHAR c);
|
|
|
|
/** Pointer to a function that matches any character in the supplied range (I suppose it could be a token range too
|
|
* but this would only be useful if the tokens were in tsome guaranteed order which is
|
|
* only going to happen with a hand crafted token set).
|
|
*/
|
|
bool matchRange(ANTLR_UCHAR low, ANTLR_UCHAR high);
|
|
|
|
/** Pointer to a function that matches the next token/char in the input stream
|
|
* regardless of what it actaully is.
|
|
*/
|
|
void matchAny();
|
|
|
|
/** Pointer to a function that recovers from an error found in the input stream.
|
|
* Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also
|
|
* be from a mismatched token that the (*match)() could not recover from.
|
|
*/
|
|
void recover();
|
|
|
|
/** Function to return the current line number in the input stream
|
|
*/
|
|
ANTLR_UINT32 getLine();
|
|
ANTLR_MARKER getCharIndex();
|
|
ANTLR_UINT32 getCharPositionInLine();
|
|
|
|
/** Function to return the text so far for the current token being generated
|
|
*/
|
|
StringType getText();
|
|
|
|
//Other utility functions
|
|
void fillExceptionData( ExceptionBaseType* ex );
|
|
|
|
/** Default lexer error handler (works for 8 bit streams only!!!)
|
|
*/
|
|
void displayRecognitionError( ANTLR_UINT8** tokenNames, ExceptionBaseType* ex);
|
|
void exConstruct();
|
|
TokenType* getMissingSymbol( IntStreamType* istream, ExceptionBaseType* e,
|
|
ANTLR_UINT32 expectedTokenType, BitsetListType* follow);
|
|
|
|
/** Pointer to a function that knows how to free the resources of a lexer
|
|
*/
|
|
~Lexer();
|
|
};
|
|
|
|
ANTLR_END_NAMESPACE()
|
|
|
|
#include "antlr3lexer.inl"
|
|
|
|
#endif
|