Pattern class holds a regex pattern and its compiled FSM opcode table or code for the reflex::Matcher engine. More...
#include <pattern.h>
Classes | |
struct | Chars |
Set of chars and meta chars. More... | |
struct | Const |
Common constants. More... | |
struct | DFA |
DFA created by subset construction from regex patterns. More... | |
struct | HFA |
Indexing hash finite state automaton for indexed file search. More... | |
struct | ModConst |
Modifiers 'i', 'm', 'q', 's', 'u' (enable) 'I', 'M', 'Q', 'S', 'U' (disable) More... | |
struct | Option |
Global modifier modes, syntax flags, and compiler options. More... | |
struct | Position |
Finite state machine construction position information. More... | |
Public Types | |
typedef uint8_t | Pred |
predict match bits More... | |
typedef uint16_t | Hash |
hash value type, max value is Const::HASH More... | |
typedef uint32_t | Index |
index into opcodes array Pattern::opc_ and subpattern indexing More... | |
typedef uint32_t | Accept |
group capture index More... | |
typedef uint32_t | Opcode |
32 bit opcode word More... | |
typedef void(* | FSM) (class Matcher &) |
Public Member Functions | |
Pattern () | |
Construct an unset pattern. More... | |
Pattern (const char *regex, const char *options=NULL) | |
Construct a pattern object given a regex string. More... | |
Pattern (const char *regex, const std::string &options) | |
Construct a pattern object given a regex string. More... | |
Pattern (const std::string ®ex, const char *options=NULL) | |
Construct a pattern object given a regex string. More... | |
Pattern (const std::string ®ex, const std::string &options) | |
Construct a pattern object given a regex string. More... | |
Pattern (const Opcode *code, const uint8_t *pred=NULL) | |
Construct a pattern object given an opcode table. More... | |
Pattern (FSM fsm, const uint8_t *pred=NULL) | |
Construct a pattern object given a function pointer to FSM code. More... | |
Pattern (const Pattern &pattern) | |
Copy constructor. More... | |
virtual | ~Pattern () |
Destructor, deletes internal code array when owned and allocated. More... | |
void | clear () |
Clear and delete pattern data. More... | |
Pattern & | assign (const char *regex, const char *options=NULL) |
Assign a (new) pattern. More... | |
Pattern & | assign (const char *regex, const std::string &options) |
Assign a (new) pattern. More... | |
Pattern & | assign (const std::string ®ex, const char *options=NULL) |
Assign a (new) pattern. More... | |
Pattern & | assign (const std::string ®ex, const std::string &options) |
Assign a (new) pattern. More... | |
Pattern & | assign (const Opcode *code, const uint8_t *pred=NULL) |
Assign a (new) pattern. More... | |
Pattern & | assign (FSM fsm, const uint8_t *pred=NULL) |
Assign a (new) pattern. More... | |
Pattern & | operator= (const Pattern &pattern) |
Assign a (new) pattern. More... | |
Pattern & | operator= (const char *regex) |
Assign a (new) pattern. More... | |
Pattern & | operator= (const std::string ®ex) |
Assign a (new) pattern. More... | |
Pattern & | operator= (const Opcode *code) |
Assign a (new) pattern. More... | |
Pattern & | operator= (FSM fsm) |
Assign a (new) pattern. More... | |
Accept | size () const |
Get the number of subpatterns of this pattern object. More... | |
bool | empty () const |
Return true if this pattern is not assigned. More... | |
const std::string | operator[] (Accept choice) const |
Get subpattern regex of this pattern object or the whole regex with index 0. More... | |
bool | reachable (Accept choice) const |
Check if subpattern is reachable by a match. More... | |
size_t | nodes () const |
Get the number of finite state machine nodes (vertices). More... | |
size_t | edges () const |
Get the number of finite state machine edges (transitions on input characters). More... | |
size_t | words () const |
Get the code size in number of words. More... | |
size_t | hashes () const |
Get the total number of indexing hash tables constructed for the optional HFA. More... | |
float | parse_time () const |
Get elapsed regex parsing and analysis time. More... | |
float | nodes_time () const |
Get elapsed DFA vertices construction time. More... | |
float | edges_time () const |
Get elapsed DFA edges construction time. More... | |
float | words_time () const |
Get elapsed code words assembly time. More... | |
float | analysis_time () const |
Get elapsed time of DFA analysis to predict matches and construct an optional HFA. More... | |
bool | has_hfa () const |
bool | match_hfa (const uint8_t *indexed, size_t size) const |
Static Public Member Functions | |
static bool | predict_match (const Pred pmh[], const char *s, size_t n) |
Returns true when match is predicted, based on s[0..3..e-1] (e >= s + 4). More... | |
static size_t | predict_match (const Pred pma[], const char *s) |
Returns zero when match is predicted (removed shift distance return, now just returns 0 or 1). More... | |
static uint8_t | frequency (uint8_t c) |
Relative frequency of English letters with upper/lower-case ratio = 0.0563, punctuation and UTF-8 bytes. More... | |
Protected Member Functions | |
virtual void | error (regex_error_type code, size_t pos=0) const |
Throw an error. More... | |
Private Types | |
enum | Meta { META_MIN = 0x100, META_WBB = 0x101, META_WBE = 0x102, META_NWB = 0x103, META_NWE = 0x104, META_BWB = 0x105, META_EWB = 0x106, META_BWE = 0x107, META_EWE = 0x108, META_BOL = 0x109, META_EOL = 0x10A, META_BOB = 0x10B, META_EOB = 0x10C, META_UND = 0x10D, META_IND = 0x10E, META_DED = 0x10F, META_MAX } |
Meta characters. More... | |
typedef uint8_t | Mod |
typedef uint16_t | Char |
typedef uint8_t | Lazy |
typedef uint16_t | Iter |
typedef uint16_t | Lookahead |
typedef std::set< Lookahead > | Lookaheads |
typedef uint32_t | Location |
typedef ORanges< Location > | Locations |
typedef std::map< int, Locations > | Map |
typedef Locations | Mods[10] |
typedef std::vector< Position > | Lazypos |
typedef std::vector< Position > | Positions |
typedef std::map< Position, Positions > | Follow |
typedef std::pair< Chars, Positions > | Move |
typedef std::list< Move > | Moves |
Private Member Functions | |
void | init (const char *options, const uint8_t *pred=NULL) |
Initialize the pattern at construction. More... | |
void | init_options (const char *options) |
void | parse (Positions &startpos, Follow &followpos, Lazypos &lazypos, Mods modifiers, Map &lookahead) |
void | parse1 (bool begin, Location &loc, Positions &firstpos, Positions &lastpos, bool &nullable, Follow &followpos, Lazy &lazyidx, Lazypos &lazypos, Mods modifiers, Locations &lookahead, Iter &iter) |
void | parse2 (bool begin, Location &loc, Positions &firstpos, Positions &lastpos, bool &nullable, Follow &followpos, Lazy &lazyidx, Lazypos &lazypos, Mods modifiers, Locations &lookahead, Iter &iter) |
void | parse3 (bool begin, Location &loc, Positions &firstpos, Positions &lastpos, bool &nullable, Follow &followpos, Lazy &lazyidx, Lazypos &lazypos, Mods modifiers, Locations &lookahead, Iter &iter) |
void | parse4 (bool begin, Location &loc, Positions &firstpos, Positions &lastpos, bool &nullable, Follow &followpos, Lazy &lazyidx, Lazypos &lazypos, Mods modifiers, Locations &lookahead, Iter &iter) |
Char | parse_esc (Location &loc, Chars *chars=NULL) const |
void | compile (DFA::State *start, Follow &followpos, const Lazypos &lazypos, const Mods modifiers, const Map &lookahead) |
void | lazy (const Lazypos &lazypos, Positions &pos) const |
void | lazy (const Lazypos &lazypos, const Positions &pos, Positions &pos1) const |
void | greedy (Positions &pos) const |
void | trim_anchors (Positions &follow, const Position p) const |
void | trim_lazy (Positions *pos, const Lazypos &lazypos) const |
void | compile_transition (DFA::State *state, Follow &followpos, const Lazypos &lazypos, const Mods modifiers, const Map &lookahead, Moves &moves) const |
void | transition (Moves &moves, Chars &chars, const Positions &follow) const |
void | compile_list (Location loc, Chars &chars, const Mods modifiers) const |
void | posix (size_t index, Chars &chars) const |
void | flip (Chars &chars) const |
void | assemble (DFA::State *start) |
void | compact_dfa (DFA::State *start) |
void | encode_dfa (DFA::State *start) |
void | gencode_dfa (const DFA::State *start) const |
void | check_dfa_closure (const DFA::State *state, int nest, bool &peek, bool &prev) const |
void | gencode_dfa_closure (FILE *fd, const DFA::State *start, int nest, bool peek) const |
void | graph_dfa (const DFA::State *start) const |
void | export_code () const |
void | analyze_dfa (DFA::State *start) |
void | gen_predict_match (std::set< DFA::State * > states) |
void | gen_predict_match_start (std::set< DFA::State * > states, std::map< DFA::State *, ORanges< Hash > > &hashes) |
void | gen_predict_match_transitions (size_t level, DFA::State *state, const ORanges< Hash > &labels, std::map< DFA::State *, ORanges< Hash > > &hashes) |
void | gen_match_hfa (DFA::State *start) |
void | gen_match_hfa_start (DFA::State *start, HFA::State &index, HFA::StateHashes &hashes) |
bool | gen_match_hfa_transitions (size_t level, size_t &max_level, DFA::State *state, const HFA::HashRanges &previous, HFA::State &index, HFA::StateHashes &hashes) |
bool | match_hfa_transitions (size_t level, const HFA::Hashes &hashes, const uint8_t *indexed, size_t size, HFA::VisitSet &visit, HFA::VisitSet &next_visit, bool &accept) const |
void | write_predictor (FILE *fd) const |
void | write_namespace_open (FILE *fd) const |
void | write_namespace_close (FILE *fd) const |
size_t | find_at (Location loc, char c) const |
Char | at (Location k) const |
bool | eq_at (Location loc, const char *s) const |
Char | escape_at (Location loc) const |
Char | escapes_at (Location loc, const char *escapes) const |
Static Private Member Functions | |
static void | pos_insert (Positions &s1, const Positions &s2) |
static void | pos_add (Positions &s, const Position &e) |
static void | lazy_insert (Lazypos &s1, const Lazypos &s2) |
static void | lazy_add (Lazypos &s, const Lazy i, Location p) |
static bool | is_modified (Mod mod, const Mods modifiers, Location loc) |
static void | update_modified (Mod mod, Mods modifiers, Location from, Location to) |
static uint16_t | hash_pos (const Positions *pos) |
static bool | valid_goto_index (Index index) |
static bool | valid_take_index (Index index) |
static bool | valid_lookahead_index (Index index) |
static bool | is_meta (Char c) |
static Opcode | opcode_long (Index index) |
static Opcode | opcode_take (Index index) |
static Opcode | opcode_redo () |
static Opcode | opcode_tail (Index index) |
static Opcode | opcode_head (Index index) |
static Opcode | opcode_goto (Char lo, Char hi, Index index) |
static Opcode | opcode_halt () |
static bool | is_opcode_long (Opcode opcode) |
static bool | is_opcode_take (Opcode opcode) |
static bool | is_opcode_redo (Opcode opcode) |
static bool | is_opcode_tail (Opcode opcode) |
static bool | is_opcode_head (Opcode opcode) |
static bool | is_opcode_halt (Opcode opcode) |
static bool | is_opcode_goto (Opcode opcode) |
static bool | is_opcode_meta (Opcode opcode) |
static bool | is_opcode_goto (Opcode opcode, unsigned char c) |
static Char | meta_of (Opcode opcode) |
static Char | lo_of (Opcode opcode) |
static Char | hi_of (Opcode opcode) |
static Index | index_of (Opcode opcode) |
static Index | long_index_of (Opcode opcode) |
static Lookahead | lookahead_of (Opcode opcode) |
static Char | lowercase (Char c) |
convert to lower case if c is a letter a-z, A-Z. More... | |
static Char | uppercase (Char c) |
convert to upper case if c is a letter a-z, A-Z. More... | |
static Hash | hash (Hash h, uint8_t b) |
predict match hash 0 <= hash() < Const::HASH. More... | |
static Hash | indexhash (Hash h, uint8_t b) |
file indexing hash 0 <= indexhash() < 65536, must be additive: indexhash(x,b+1) = indexhash(x,b)+1 modulo 2^16. More... | |
Private Attributes | |
Option | opt_ |
pattern compiler options More... | |
HFA | hfa_ |
indexing hash finite state automaton More... | |
DFA | tfa_ |
tree DFA constructed from strings More... | |
DFA | dfa_ |
DFA constructed from regex with subset construction using firstpos/lastpos/followpos. More... | |
std::string | rex_ |
regular expression string More... | |
std::vector< Location > | end_ |
entries point to the subpattern's ending '|' or '\0' More... | |
std::vector< bool > | acc_ |
true if subpattern n is accepting (state is reachable) More... | |
size_t | vno_ |
number of finite state machine vertices |V| More... | |
size_t | eno_ |
number of finite state machine edges |E| More... | |
size_t | hno_ |
number of indexing hash tables (HFA edges) More... | |
const Opcode * | opc_ |
points to the table with compiled finite state machine opcodes More... | |
FSM | fsm_ |
function pointer to FSM code More... | |
Index | nop_ |
number of opcodes generated More... | |
Index | cut_ |
DFA s-t cut to improve predict match and HFA accuracy together with lbk_ and cbk_. More... | |
size_t | len_ |
length of chr_[], less or equal to 255 More... | |
size_t | min_ |
patterns after the prefix are at least this long but no more than 8 More... | |
size_t | pin_ |
number of needles More... | |
std::bitset< 256 > | cbk_ |
characters to look back over when lbk_ > 0, never includes More... | |
std::bitset< 256 > | fst_ |
the beginning characters of the pattern More... | |
char | chr_ [256] |
pattern prefix string or character needles for needle-based search More... | |
Pred | bit_ [256] |
bitap array More... | |
Pred | pmh_ [Const::HASH] |
predict-match hash array More... | |
Pred | pma_ [Const::HASH] |
predict-match array More... | |
uint16_t | lbk_ |
lookback distance or 0xffff unlimited lookback or 0 for no lookback (empty cbk_) More... | |
uint16_t | lbm_ |
loopback minimum distance when lbk_ > 0 More... | |
uint16_t | lcp_ |
primary least common character position in the pattern or 0xffff More... | |
uint16_t | lcs_ |
secondary least common character position in the pattern or 0xffff More... | |
size_t | bmd_ |
Boyer-Moore jump distance on mismatch, B-M is enabled when bmd_ > 0. More... | |
uint8_t | bms_ [256] |
Boyer-Moore skip array. More... | |
float | pms_ |
ms elapsed time to parse regex More... | |
float | vms_ |
ms elapsed time to compile DFA vertices More... | |
float | ems_ |
ms elapsed time to compile DFA edges More... | |
float | wms_ |
ms elapsed time to assemble code words More... | |
float | ams_ |
ms elapsed time to analyze DFA for predict match and HFA More... | |
size_t | npy_ |
entropy derived from the bitap array bit_[] More... | |
bool | one_ |
true if matching one string stored in chr_[] without meta/anchors More... | |
Friends | |
class | Matcher |
permit access by the reflex::Matcher engine More... | |
class | FuzzyMatcher |
permit access by the reflex::FuzzyMatcher engine More... | |
Pattern class holds a regex pattern and its compiled FSM opcode table or code for the reflex::Matcher engine.
typedef uint32_t reflex::Pattern::Accept |
group capture index
|
private |
|
private |
typedef void(* reflex::Pattern::FSM) (class Matcher &) |
function pointer to FSM code
typedef uint16_t reflex::Pattern::Hash |
hash value type, max value is Const::HASH
typedef uint32_t reflex::Pattern::Index |
index into opcodes array Pattern::opc_ and subpattern indexing
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
typedef uint32_t reflex::Pattern::Opcode |
32 bit opcode word
|
private |
typedef uint8_t reflex::Pattern::Pred |
predict match bits
|
private |
Meta characters.
|
inline |
Construct an unset pattern.
|
inlineexplicit |
Construct a pattern object given a regex string.
|
inline |
Construct a pattern object given a regex string.
|
inlineexplicit |
Construct a pattern object given a regex string.
|
inline |
Construct a pattern object given a regex string.
|
inlineexplicit |
Construct a pattern object given an opcode table.
|
inlineexplicit |
Construct a pattern object given a function pointer to FSM code.
|
inline |
Copy constructor.
pattern | pattern to copy |
|
inlinevirtual |
Destructor, deletes internal code array when owned and allocated.
|
inline |
|
private |
|
private |
|
inline |
Assign a (new) pattern.
|
inline |
Assign a (new) pattern.
|
inline |
Assign a (new) pattern.
|
inline |
Assign a (new) pattern.
Assign a (new) pattern.
Assign a (new) pattern.
|
private |
|
inline |
Clear and delete pattern data.
|
private |
|
private |
|
private |
|
private |
|
inline |
Get the number of finite state machine edges (transitions on input characters).
|
inline |
Get elapsed DFA edges construction time.
|
inline |
Return true if this pattern is not assigned.
|
private |
|
inlineprivate |
|
protectedvirtual |
Throw an error.
code | error code |
pos | optional location of the error in regex string Pattern::rex_ |
|
private |
|
inlineprivate |
|
private |
|
inlinestatic |
Relative frequency of English letters with upper/lower-case ratio = 0.0563, punctuation and UTF-8 bytes.
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
inline |
predict match hash 0 <= hash() < Const::HASH.
|
inlinestaticprivate |
|
inline |
file indexing hash 0 <= indexhash() < 65536, must be additive: indexhash(x,b+1) = indexhash(x,b)+1 modulo 2^16.
|
private |
Initialize the pattern at construction.
|
private |
|
inlinestaticprivate |
|
inlinestaticprivate |
|
inlinestaticprivate |
|
inlinestaticprivate |
|
inlinestaticprivate |
|
inlinestaticprivate |
|
inlinestaticprivate |
|
inlinestaticprivate |
|
inlinestaticprivate |
|
inlinestaticprivate |
|
inlinestaticprivate |
|
private |
convert to lower case if c is a letter a-z, A-Z.
bool reflex::Pattern::match_hfa | ( | const uint8_t * | indexed, |
size_t | size | ||
) | const |
|
private |
|
inline |
Get the number of finite state machine nodes (vertices).
|
inline |
Get elapsed DFA vertices construction time.
|
inlinestaticprivate |
|
inlinestaticprivate |
|
inline |
Assign a (new) pattern.
|
inline |
Assign a (new) pattern.
const std::string reflex::Pattern::operator[] | ( | Accept | choice | ) | const |
Get subpattern regex of this pattern object or the whole regex with index 0.
|
private |
|
private |
|
private |
|
private |
|
private |
|
inline |
Get elapsed regex parsing and analysis time.
|
inlinestaticprivate |
|
private |
|
inlinestatic |
Returns true when match is predicted, based on s[0..3..e-1] (e >= s + 4).
|
inlinestatic |
Returns zero when match is predicted (removed shift distance return, now just returns 0 or 1).
|
inline |
Check if subpattern is reachable by a match.
|
inline |
Get the number of subpatterns of this pattern object.
|
private |
|
inlinestaticprivate |
convert to upper case if c is a letter a-z, A-Z.
|
inlinestaticprivate |
|
inlinestaticprivate |
|
inlinestaticprivate |
|
inline |
Get the code size in number of words.
|
inline |
Get elapsed code words assembly time.
|
private |
|
private |
|
private |
|
friend |
permit access by the reflex::FuzzyMatcher engine
|
friend |
permit access by the reflex::Matcher engine
|
private |
true if subpattern n is accepting (state is reachable)
|
private |
bitap array
|
private |
Boyer-Moore jump distance on mismatch, B-M is enabled when bmd_ > 0.
|
private |
Boyer-Moore skip array.
|
private |
characters to look back over when lbk_ > 0, never includes
|
private |
pattern prefix string or character needles for needle-based search
|
private |
|
private |
DFA constructed from regex with subset construction using firstpos/lastpos/followpos.
|
private |
ms elapsed time to compile DFA edges
|
private |
entries point to the subpattern's ending '|' or '\0'
|
private |
number of finite state machine edges |E|
|
private |
function pointer to FSM code
|
private |
the beginning characters of the pattern
|
private |
indexing hash finite state automaton
|
private |
number of indexing hash tables (HFA edges)
|
private |
lookback distance or 0xffff unlimited lookback or 0 for no lookback (empty cbk_)
|
private |
loopback minimum distance when lbk_ > 0
|
private |
primary least common character position in the pattern or 0xffff
|
private |
secondary least common character position in the pattern or 0xffff
|
private |
length of chr_[], less or equal to 255
|
private |
patterns after the prefix are at least this long but no more than 8
|
private |
number of opcodes generated
|
private |
entropy derived from the bitap array bit_[]
|
private |
true if matching one string stored in chr_[] without meta/anchors
|
private |
points to the table with compiled finite state machine opcodes
|
private |
pattern compiler options
|
private |
number of needles
|
private |
predict-match array
|
private |
predict-match hash array
|
private |
ms elapsed time to parse regex
|
private |
regular expression string
|
private |
ms elapsed time to compile DFA vertices
|
private |
number of finite state machine vertices |V|
|
private |
ms elapsed time to assemble code words