Skip to content

Commit

Permalink
abort compilation of regex-based pattern matching when the nfa to dfa…
Browse files Browse the repository at this point in the history
… translation degenerates into huge dfa states (morganstanley#143)
  • Loading branch information
smunix authored and kthielen committed Jul 16, 2018
1 parent 452eb7e commit ec332c9
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 6 deletions.
10 changes: 9 additions & 1 deletion include/hobbes/eval/cc.H
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,10 @@ public:
bool alwaysLowerPrimMatchTables() const;
void buildColumnwiseMatches(bool f);
bool buildColumnwiseMatches() const;
// enable/disable (and query) aborting regex compilation when the NFA to DFA
// translation produces a DFA that is huge relative to its NFA
void throwOnHugeRegexDFA(bool f);
bool throwOnHugeRegexDFA() const;
// set/get the DFA-size over NFA-size ratio consulted by that check
// (the check is only made when this value is greater than 0)
void regexDFAOverNFAMaxRatio(int f);
int regexDFAOverNFAMaxRatio() const;

// allow caller to gather a vector of unreachable rows arising from match compilation
UnreachableMatchRowsPtr unreachableMatchRowsPtr;
Expand All @@ -255,10 +259,14 @@ private:
bool lowerPrimMatchTables;
bool columnwiseMatches;

// abort compilation of regexes which translate into huge dfa transition states
bool shouldThrowOnHugeRegexDFA = false;
int dfaOverNfaMaxRatio = 4;

// the bound root type-def environment
TEnvPtr tenv;
TypeAliasMap typeAliases;

PolyTypePtr lookupVarType(const std::string& vname) const;

// global variables
Expand Down
6 changes: 6 additions & 0 deletions lib/hobbes/eval/cc.C
Original file line number Diff line number Diff line change
Expand Up @@ -632,5 +632,11 @@ bool cc::alwaysLowerPrimMatchTables() const { return this->lowerPrimMatchTables;
void cc::buildColumnwiseMatches(bool f) { this->columnwiseMatches = f; }
bool cc::buildColumnwiseMatches() const { return this->columnwiseMatches; }

void cc::throwOnHugeRegexDFA(bool f) { this->shouldThrowOnHugeRegexDFA = f; }
bool cc::throwOnHugeRegexDFA() const { return this-> shouldThrowOnHugeRegexDFA; }

void cc::regexDFAOverNFAMaxRatio(int f) { this->dfaOverNfaMaxRatio = f; }
int cc::regexDFAOverNFAMaxRatio() const { return this->dfaOverNfaMaxRatio; }

}

15 changes: 10 additions & 5 deletions lib/hobbes/lang/pat/regex.C
Original file line number Diff line number Diff line change
Expand Up @@ -714,7 +714,7 @@ typedef std::map<stateset, state> Nss2Ds;

// create a DFA state from a set of NFA states
// (or if it's already been made, just return the existing state)
state dfaState(const NFA& nfa, const EpsClosure& ec, Nss2Ds* nss2ds, DFA* dfa, const stateset& ss, RStates* rstates) {
state dfaState(const cc* c, const NFA& nfa, const EpsClosure& ec, Nss2Ds* nss2ds, DFA* dfa, const stateset& ss, RStates* rstates) {
// did we already make this state? if so, just return it
auto didIt = nss2ds->find(ss);
if (didIt != nss2ds->end()) {
Expand All @@ -724,12 +724,17 @@ state dfaState(const NFA& nfa, const EpsClosure& ec, Nss2Ds* nss2ds, DFA* dfa, c
// we need to make this state -- allocate it and remember it
state result = dfa->size();
dfa->resize(dfa->size() + 1);

// optionally abort when the DFA has grown out of proportion to the NFA it is derived from
// (skipped when the feature is off, when the max ratio is not positive, or when the NFA
//  is empty -- the latter also keeps the division below well-defined)
// NOTE: the size ratio itself must be compared against the limit; comparing the limit
//       against the boolean result of that comparison would trip on the first state
if (c->throwOnHugeRegexDFA() and c->regexDFAOverNFAMaxRatio() > 0 and nfa.size() > 0 and (dfa->size() / nfa.size()) > size_t(c->regexDFAOverNFAMaxRatio())) {
  throw std::runtime_error("regexes DFA over NFA Max ratio was breached");
}

(*nss2ds)[ss] = result;

// ok, how can we transition out of here?
// for each case, we'll go to a set of NFA states (recursively)
for (auto cr : usedCharRanges(nfa, ss)) {
auto ns = dfaState(nfa, ec, nss2ds, dfa, nfaTransition(nfa, ec, ss, cr), rstates);
auto ns = dfaState(c, nfa, ec, nss2ds, dfa, nfaTransition(nfa, ec, ss, cr), rstates);
(*dfa)[result].chars.insert(cr, ns);
}

Expand Down Expand Up @@ -759,15 +764,15 @@ state dfaState(const NFA& nfa, const EpsClosure& ec, Nss2Ds* nss2ds, DFA* dfa, c
return result;
}

// fill in 'dfa' (and 'rstates') from 'nfa': compute the NFA's eps-closure, then
// create DFA states via dfaState starting from the eps-closure of NFA state 0
// ('c' is passed through to dfaState unchanged)
void disambiguate(const NFA& nfa, DFA* dfa, RStates* rstates) {
void disambiguate(const cc* c, const NFA& nfa, DFA* dfa, RStates* rstates) {
// determine eps* for this NFA
EpsClosure ec;
findEpsClosure(nfa, &ec);

// starting from the eps* start state,
// follow non-eps transitions to eps* successor states
Nss2Ds nss2ds;
dfaState(nfa, ec, &nss2ds, dfa, epsState(ec, 0), rstates);
dfaState(c, nfa, ec, &nss2ds, dfa, epsState(ec, 0), rstates);
}

/*****************************
Expand Down Expand Up @@ -1270,7 +1275,7 @@ CRegexes makeRegexFn(cc* c, const Regexes& regexes, const LexicalAnnotation& roo
// now map this NFA to a DFA
DFA dfa;
RStates fstates;
disambiguate(nfa, &dfa, &fstates);
disambiguate(c, nfa, &dfa, &fstates);

// make all char ranges compact and minimize the results to avoid redundant work in the caller
mergeCharRangesAndEqResults(&dfa, fstates, &result.rstates);
Expand Down

0 comments on commit ec332c9

Please sign in to comment.