You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

238 lines
9.0 KiB

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
//
// rbbirb.h
//
// Copyright (C) 2002-2008, International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains declarations for several classes from the
// Rule Based Break Iterator rule builder.
//
#ifndef RBBIRB_H
#define RBBIRB_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include <utility>
#include "unicode/uobject.h"
#include "unicode/rbbi.h"
#include "unicode/uniset.h"
#include "unicode/parseerr.h"
#include "uhash.h"
#include "uvector.h"
#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
// looks up references to $variables within a set.
#ifdef RBBI_DEBUG
#include <stdio.h>  // printf/puts behind RBBIDebugPrintf/RBBIDebugPuts; kept at global scope
#endif
U_NAMESPACE_BEGIN
class RBBIRuleScanner;
struct RBBIRuleTableEl;
class RBBISetBuilder;
class RBBINode;
class RBBITableBuilder;
//--------------------------------------------------------------------------------
//
// RBBISymbolTable. Implements SymbolTable interface that is used by the
// UnicodeSet parser to resolve references to $variables.
//
//--------------------------------------------------------------------------------
// One record of the symbol table's hash table: the table contains one of
// these for each entry.
class RBBISymbolTableEntry : public UMemory {
public:
    RBBISymbolTableEntry();

    UnicodeString   key;    // the entry's key string
    RBBINode       *val;    // the node stored for that key

    ~RBBISymbolTableEntry();

private:
    // Copying of this class is forbidden: both copy operations are private.
    RBBISymbolTableEntry(const RBBISymbolTableEntry &);
    RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &);
};
// Symbol table for the rule builder.  Implements the SymbolTable interface
// that the UnicodeSet parser uses to resolve references to $variables.
class RBBISymbolTable : public UMemory, public SymbolTable {
private:
    const UnicodeString    &fRules;
    UHashtable             *fHashTable;
    RBBIRuleScanner        *fRuleScanner;

    // The following two members belong to the mechanism that hands references
    // to already-constructed UnicodeSets back to the UnicodeSet constructor
    // when a pattern includes $variable references.
    const UnicodeString     ffffString;       // original note: = "/uffff"; presumably the
                                              //   one-character string U+FFFF -- TODO confirm
    UnicodeSet             *fCachedSetLookup;

public:
    // --- API inherited from class SymbolTable ---
    virtual const UnicodeString  *lookup(const UnicodeString &s) const;
    virtual const UnicodeFunctor *lookupMatcher(UChar32 ch) const;
    virtual UnicodeString         parseReference(const UnicodeString &text,
                                                 ParsePosition &pos,
                                                 int32_t limit) const;

    // --- Additional functions ---
    RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status);
    virtual ~RBBISymbolTable();

    virtual RBBINode *lookupNode(const UnicodeString &key) const;
    virtual void      addEntry(const UnicodeString &key, RBBINode *val, UErrorCode &err);

#ifdef RBBI_DEBUG
    virtual void      rbbiSymtablePrint() const;
#else
    // Non-debug builds: rbbiSymtablePrint() is a macro, not a member function.
    // A call site "obj->rbbiSymtablePrint();" becomes a plain store of 0 into
    // fFakeField.  (Original rationale: member funcs can't be empty or the
    // call sites won't compile.)
    int32_t           fFakeField;
#define rbbiSymtablePrint() fFakeField=0;
#endif

private:
    // Copying of this class is forbidden: both copy operations are private.
    RBBISymbolTable(const RBBISymbolTable &);
    RBBISymbolTable &operator=(const RBBISymbolTable &);
};
//--------------------------------------------------------------------------------
//
// class RBBIRuleBuilder The top-level class handling RBBI rule compiling.
//
//--------------------------------------------------------------------------------
// Top-level driver for compiling RBBI rules.
class RBBIRuleBuilder : public UMemory {
public:
    // Main entry point into the rule builder: create a rule based break
    // iterator from a set of rules.  The public ICU API for creating RBBIs
    // uses this function to do the actual work.
    static BreakIterator *createRuleBasedBreakIterator(const UnicodeString &rules,
                                                       UParseError         *parseError,
                                                       UErrorCode          &status);

    // Everything "public" from here down is accessed (and shared) by the
    // various parts that make up the rule builder.  None of it is intended
    // to be accessed by anything outside of the rule builder implementation.
    RBBIRuleBuilder(const UnicodeString &rules,
                    UParseError         *parseErr,
                    UErrorCode          &status);

    virtual ~RBBIRuleBuilder();

    /**
     * Build the state tables and char class Trie from the source rules.
     */
    RBBIDataHeader *build(UErrorCode &status);

    /**
     * Fold together redundant character classes (table columns) and
     * redundant states (table rows).  Done after initial table generation,
     * before serializing the result.
     */
    void optimizeTables();

    char             *fDebugEnv;            // Controls debug trace output.

    // Error reporting.  Keeping the status and parse error here avoids
    // passing them everywhere.
    UErrorCode       *fStatus;
    UParseError      *fParseError;

    const UnicodeString &fRules;            // The rule string being compiled.
    UnicodeString     fStrippedRules;       // The rule string, with comments stripped.

    RBBIRuleScanner  *fScanner;             // The scanner.

    // The parse trees: generated by the scanner, then manipulated by
    // subsequent steps.
    RBBINode         *fForwardTree;
    RBBINode         *fReverseTree;
    RBBINode         *fSafeFwdTree;
    RBBINode         *fSafeRevTree;

    RBBINode        **fDefaultTree;         // For rules not qualified with a '!':
                                            //   the tree they belong to.

    UBool             fChainRules;          // True:  chained Unicode TR style rules.
                                            // False: traditional regexp rules.

    UBool             fLBCMNoChain;         // True: suppress chaining of rules on
                                            //   chars with LineBreak property == CM.

    UBool             fLookAheadHardBreak;  // True: look-ahead matches cause an
                                            //   immediate break, with no continuing
                                            //   for the longest match.

    RBBISetBuilder   *fSetBuilder;          // Set and Character Category builder.
    UVector          *fUSetNodes;           // Vector of all uset nodes.

    RBBITableBuilder *fForwardTable;        // State transition table, build time form.

    UVector          *fRuleStatusVals;      // The values that can be returned
                                            //   from getRuleStatus().

    RBBIDataHeader   *flattenData();        // Create the flattened (runtime format)
                                            //   data tables.

private:
    // Copying of this class is forbidden: both copy operations are private.
    RBBIRuleBuilder(const RBBIRuleBuilder &);
    RBBIRuleBuilder &operator=(const RBBIRuleBuilder &);
};
//----------------------------------------------------------------------------
//
// RBBISetTableEl is an entry in the hash table of UnicodeSets that have
// been encountered. The val Node will be of nodetype uset
// and contain pointers to the actual UnicodeSets.
// The Key is the source string for initializing the set.
//
// The hash table is used to avoid creating duplicate
// unnamed (not $var references) UnicodeSets.
//
// Memory Management:
// The Hash Table owns these RBBISetTableEl structs and
// the key strings. It does NOT own the val nodes.
//
//----------------------------------------------------------------------------
// One entry in the hash table of UnicodeSets encountered so far.
struct RBBISetTableEl {
// Source string used to initialize the set; serves as the hash key.
// The key string is owned by the hash table.
UnicodeString *key;
// Node of nodetype uset that points to the actual UnicodeSets.
// NOT owned by the hash table.
RBBINode *val;
};
/**
 * Two int32_t values bundled together; used for pairs of states or pairs of
 * character classes.
 */
using IntPair = std::pair<int32_t, int32_t>;
//----------------------------------------------------------------------------
//
// RBBIDebugPrintf, RBBIDebugPuts
//     printf / puts equivalents, for debugging output.
//     Conditional compilation lets us get rid of the stdio dependency in
//     environments where it is unavailable.
//
//     RBBI_DEBUG defined:
//         The macros are plain aliases for printf and puts.  <stdio.h> is
//         pulled in with the other #includes at the top of this file rather
//         than here, because this point lies between U_NAMESPACE_BEGIN and
//         U_NAMESPACE_END and a system header must not be included inside
//         that region.
//
//     RBBI_DEBUG not defined:
//         RBBIDebugPuts(arg) expands to nothing, so arg is never evaluated.
//         RBBIDebugPrintf is left undefined; any use of it must itself sit
//         inside #ifdef RBBI_DEBUG or it will not compile.
//
//----------------------------------------------------------------------------
#ifdef RBBI_DEBUG
#define RBBIDebugPrintf printf
#define RBBIDebugPuts puts
#else
#undef RBBIDebugPrintf
#define RBBIDebugPuts(arg)
#endif
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
#endif