Files
textmate/Frameworks/parse/src/parse.cc
Allan Odgaard cab42a83c5 Use injection patterns from “group rules”
For example if one grammar includes another, the included grammar works as a group rule and previously had its injections ignored.
2013-08-20 18:43:18 +02:00

446 lines
15 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#include "grammar.h"
#include "private.h"
#include <regexp/regexp.h>
#include <regexp/format_string.h>
#include <bundles/bundles.h>
#include <oak/oak.h>
// Debug categories for this file: the D(DBF_Parser, …) and D(DBF_Parser_Flow, …)
// trace statements below are keyed on these two names.
// NOTE(review): the OAK_DEBUG_VAR macro is defined outside this file (presumably
// in <oak/oak.h>) — confirm what it expands to in release builds.
OAK_DEBUG_VAR(Parser);
OAK_DEBUG_VAR(Parser_Flow);
namespace parse
{
// Out-of-class definition of the static counter declared in rule_t; starts at zero.
// NOTE(review): presumably the source of the rule_id values that are used as
// match-cache keys further down — confirm against the rule_t declaration.
size_t rule_t::rule_id_counter = 0;
// Compares two parser stacks.
//
// The stacks are equal when both pointers are identical (which includes the
// case where both are null) or when both are non-null and the frames they
// point to compare equal via stack_t::operator==.
bool equal (stack_ptr lhs, stack_ptr rhs)
{
	if(lhs == rhs)
		return true;

	// Exactly one side being null means they cannot be equal.
	if(!lhs || !rhs)
		return false;

	return *lhs == *rhs;
}
// Returns true when `ptrn` contains at least one '$' character, which is the
// only criterion used here to decide that a scope name needs to be run
// through format string expansion.
static bool pattern_is_format_string (std::string const& ptrn)
{
	bool const hasDollar = ptrn.find('$') != std::string::npos;
	D(DBF_Parser, bug("%s: %s\n", ptrn.c_str(), BSTR(hasDollar)););
	return hasDollar;
}
// Builds the Oniguruma search options for matching inside the line [first, last).
//
// isFirstLine  when false, ONIG_OPTION_NOTBOS is set.
// isGPos       when false, ONIG_OPTION_NOTGPOS is set.
// first, last  the line being parsed; when it is non-empty and its final byte
//              is a newline, ONIG_OPTION_NOTEOS is set.
//
// Returns the combined option flags (ONIG_OPTION_NONE when nothing applies).
static OnigOptionType anchor_options (bool isFirstLine, bool isGPos, char const* first, char const* last)
{
	// Guard against an empty range before peeking at the last byte.
	bool const endsWithNewline = first != last && last[-1] == '\n';

	OnigOptionType flags = ONIG_OPTION_NONE;
	if(endsWithNewline)
		flags |= ONIG_OPTION_NOTEOS;
	if(!isGPos)
		flags |= ONIG_OPTION_NOTGPOS;
	if(!isFirstLine)
		flags |= ONIG_OPTION_NOTBOS;
	return flags;
}
// Copies the bytes in [it, last) to `out`, inserting a backslash in front of
// every byte that is listed in `special`, so that the text can be embedded
// in a regular expression and match literally.
//
// Note that strchr() also matches the terminating NUL of `special`, so a NUL
// byte in the input is preceded by a backslash as well.
//
// Returns the output iterator positioned after the last byte written.
template <typename _OutputIter>
_OutputIter escape_regexp (char const* it, char const* last, _OutputIter out)
{
	DB(std::string tmp);
	DB(std::string org(it, last));

	static char const* special = "\\|([{}]).?*+^$";
	for(; it != last; ++it)
	{
		char const ch = *it;
		if(strchr(special, ch))
		{
			DB(tmp += '\\');
			*out++ = '\\';
		}

		DB(tmp += ch);
		*out++ = ch;
	}

	D(DBF_Parser, bug("%s → %s\n", org.c_str(), tmp.c_str()););
	return out;
}
static std::string expand_back_references (std::string const& ptrn, regexp::match_t const& m)
{
bool escape = false;
std::string res;
iterate(it, ptrn)
{
if(escape && isdigit(*it))
{
int i = digittoint(*it);
if(!m.empty(i))
escape_regexp(m.buffer() + m.begin(i), m.buffer() + m.end(i), back_inserter(res));
escape = false;
continue;
}
if(escape)
res += '\\';
if(!(escape = !escape && *it == '\\'))
res += *it;
}
D(DBF_Parser, bug("%s → %s\n", ptrn.c_str(), res.c_str()););
return res;
}
// Deep comparison of two stack frames.
//
// Two frames are equal when they refer to equal rules, carry the same scope
// and the same while/end patterns, and their parent chains are equal as well
// (both absent, or both present and recursively equal).
bool stack_t::operator== (stack_t const& rhs) const
{
	if(*rule != *rhs.rule)
		return false;
	if(scope != rhs.scope)
		return false;
	if(while_pattern != rhs.while_pattern)
		return false;
	if(end_pattern != rhs.end_pattern)
		return false;

	// With at least one parent missing, the frames only match if both are missing.
	if(!parent || !rhs.parent)
		return !parent && !rhs.parent;

	return !(*parent != *rhs.parent);
}
// Negation of operator==; see there for what makes two frames equal.
bool stack_t::operator!= (stack_t const& rhs) const
{
	return !operator==(rhs);
}
// A candidate match for a rule together with the rank used to break ties.
//
// Instances are kept in a std::set, so operator< defines which candidate is
// tried first: the one starting earliest in the line, and among candidates
// starting at the same offset the one with the lowest rank.
struct ranked_match_t
{
	ranked_match_t (rule_t const* rule, regexp::match_t const& match, size_t rank, bool is_end_pattern = false)
		: rule(rule)
		, match(match)
		, rank(rank)
		, is_end_pattern(is_end_pattern)
	{
	}

	rule_t const* rule;    // rule that produced the match
	regexp::match_t match; // the match itself
	size_t rank;           // tie-breaker for matches starting at the same offset
	bool is_end_pattern;   // true when `match` comes from the current frame's end pattern

	WATCH_LEAKS(ranked_match_t);

	bool operator< (ranked_match_t const& rhs) const
	{
		if(match.begin() != rhs.match.begin())
			return match.begin() < rhs.match.begin();
		return rank < rhs.rank;
	}
};
// Returns `current_scope` with the scope name `format_string` appended.
//
// When the name contains a '$' it is first expanded as a format string with
// the captures of `match` as variables; otherwise it is appended verbatim.
static scope::scope_t create_scope (scope::scope_t const& current_scope, std::string const& format_string, regexp::match_t const& match)
{
	if(pattern_is_format_string(format_string))
		return current_scope.append(format_string::expand(format_string, match.captures()), true);

	return current_scope.append(format_string, true);
}
// Records the scopes introduced by the capture rules of a match.
//
// scope      scope in effect at the start of the match (taken by value and
//            used as the running scope while walking the captures).
// m          the match whose capture ranges are being scoped.
// captures   capture rules keyed by capture name; may be null, in which case
//            nothing happens.
// res        offset → scope map that receives the scope changes.
// firstLine  forwarded to parse() when a capture rule has child rules.
static void apply_captures (scope::scope_t scope, regexp::match_t const& m, repository_ptr const& captures, std::map<size_t, scope::scope_t>& res, bool firstLine)
{
	if(!captures)
		return;

	// Capture rules keyed by (start offset, negated length). With this key the
	// multimap iterates captures by start offset, and for captures starting at
	// the same offset the longest (outermost) one comes first.
	std::multimap<std::pair<size_t, ssize_t>, rule_ptr> rules;

	// Both containers are ordered by capture name, so walk them in lock-step
	// and pick out every capture range that has a rule of the same name.
	repository_t::const_iterator ruleIter = captures->begin();
	std::multimap<std::string, std::pair<size_t, size_t> >::const_iterator indexIter = m.capture_indices().begin();
	while(ruleIter != captures->end() && indexIter != m.capture_indices().end())
	{
		if(ruleIter->first < indexIter->first)
		{
			++ruleIter;
			continue;
		}

		if(ruleIter->first == indexIter->first)
		{
			size_t const captureFrom = indexIter->second.first;
			size_t const captureTo   = indexIter->second.second;
			ssize_t const negatedLength = -(captureTo - captureFrom);
			rules.insert(std::make_pair(std::make_pair(captureFrom, negatedLength), ruleIter->second));
		}
		++indexIter;
	}

	// Captures that are currently open: (end offset, scope to restore at that offset).
	std::vector< std::pair<size_t, scope::scope_t> > pending;
	for(auto const& entry : rules)
	{
		size_t const from = entry.first.first;

		// Close every open capture that ends at or before the start of this one.
		while(!pending.empty() && pending.back().first <= from)
		{
			scope = res[pending.back().first] = pending.back().second;
			pending.pop_back();
		}

		// The second key component is the negated length, so this is from + length.
		size_t const to = entry.first.first - entry.first.second;
		pending.push_back(std::make_pair(to, scope));

		rule_ptr const& rule = entry.second;
		if(rule->scope_string != NULL_STR)
			scope = res[from] = create_scope(scope, rule->scope_string, m);

		if(!rule->children.empty())
		{
			// The capture rule has patterns of its own: parse the captured range with them.
			D(DBF_Parser, bug("re-parse: %.*s (range %zu-%zu)\n", (int)(to - from), m.buffer() + from, from, to););
			parse::stack_ptr nested(new parse::stack_t(rule.get(), scope));
			nested->anchor = from;
			parse(m.buffer(), m.buffer() + to, nested, res, firstLine, from);
		}
	}

	// Close whatever is still open.
	while(!pending.empty())
	{
		scope = res[pending.back().first] = pending.back().second;
		pending.pop_back();
	}
}
static void collect_children (std::vector<rule_ptr> const& children, std::vector<rule_t*>& res, std::vector<rule_t*>* groups);

// Resolves `rule` to the rule(s) that can actually be matched and appends
// them to `res`.
//
// rule    rule to resolve; may be null.
// res     receives every rule reached that has a match pattern.
// groups  optional; when non-null it receives the include rules that were
//         followed and the pattern-less container rules that were descended
//         into.
//
// Rules are flagged via `included` as they are collected, so that a rule is
// visited at most once per collection pass. The flag is only set on include
// and container rules when `groups` is given. Clearing the flags again is
// left to the callers.
static void collect_rule (rule_t* rule, std::vector<rule_t*>& res, std::vector<rule_t*>* groups)
{
	// Follow the chain of include references to the rule it ends at.
	for(; rule && rule->include && !rule->included; rule = rule->include)
	{
		if(groups)
		{
			rule->included = true;
			groups->push_back(rule);
		}
	}

	if(!rule || rule->included)
		return;

	if(rule->match_pattern)
	{
		rule->included = true;
		res.push_back(rule);
		return;
	}

	if(rule->children.empty())
		return;

	// A rule without a pattern of its own: it only groups other rules.
	if(groups)
	{
		rule->included = true;
		groups->push_back(rule);
	}
	collect_children(rule->children, res, groups);
}
// Runs collect_rule() on each element of `children`, in order, passing `res`
// and `groups` through unchanged.
static void collect_children (std::vector<rule_ptr> const& children, std::vector<rule_t*>& res, std::vector<rule_t*>* groups)
{
	for(auto child = children.begin(); child != children.end(); ++child)
		collect_rule(child->get(), res, groups);
}
// Collects the injected rules that apply in the given scope context.
//
// stack   current parser stack; the injections of every rule on it (from
//         the top frame up through its parents) are considered.
// scope   context that an injection's selector must match.
// groups  group rules gathered while collecting the regular rules; their
//         injections are considered too.
// res     receives the rules of all matching injections.
static void collect_injections (stack_ptr const& stack, scope::context_t const& scope, std::vector<rule_t*> const& groups, std::vector<rule_t*>& res)
{
	// Adds the rules of every injection of `owner` whose selector matches.
	auto const collect_matching = [&scope, &res](rule_t const* owner) {
		for(auto const& injection : owner->injections)
		{
			if(injection.first.does_match(scope))
				collect_rule(injection.second.get(), res, nullptr);
		}
	};

	for(stack_ptr node = stack; node; node = node->parent)
		collect_matching(node->rule);

	for(rule_t const* group : groups)
	{
		if(!group->is_root) // root rules are already handled via the stack
			collect_matching(group);
	}
}
// Searches for each rule in `rules` and records the successful matches.
//
// rank         rank of the last match handed out so far; each match added
//              here gets the next higher rank.
// rules        rules to try, in priority order. Their `included` flag is
//              cleared as a side effect.
// first, last  the line being parsed.
// options      Oniguruma search options.
// i            offset in the line to start searching from.
// res          receives one ranked_match_t per rule that matched.
// match_cache  search results keyed by rule id. A cached result (match or no
//              match) is reused instead of searching again; new results are
//              only stored for rules whose pattern is not anchored.
//
// Returns the last rank that was handed out.
static size_t apply_rules (size_t rank, std::vector<rule_t*> const& rules, char const* first, char const* last, OnigOptionType options, size_t i, std::set<ranked_match_t>& res, std::map<size_t, regexp::match_t>& match_cache)
{
	for(rule_t* rule : rules)
	{
		rule->included = false;

		auto cached = match_cache.find(rule->rule_id);
		if(cached == match_cache.end())
		{
			auto match = regexp::search(rule->match_pattern, first, last, first + i, last, options);
			if(!rule->match_pattern_is_anchored)
				match_cache.emplace(rule->rule_id, match);
			if(match)
				res.emplace(rule, match, ++rank);
		}
		else if(cached->second)
		{
			res.emplace(rule, cached->second, ++rank);
		}
	}
	return rank;
}
static void collect_rules (char const* first, char const* last, size_t i, bool firstLine, stack_ptr const& stack, std::set<ranked_match_t>& res, std::map<size_t, regexp::match_t>& match_cache)
{
std::vector<rule_t*> rules, groups, injectedRulesPre, injectedRulesPost;
collect_children(stack->rule->children, rules, &groups);
collect_injections(stack, scope::context_t(stack->scope, ""), groups, injectedRulesPre);
collect_injections(stack, scope::context_t("", stack->scope), groups, injectedRulesPost);
for(rule_t* rule : groups)
rule->included = false;
// ============================
// = Match rules against text =
// ============================
res.clear();
OnigOptionType const options = anchor_options(firstLine, stack->anchor == i, first, last);
size_t rank = apply_rules(0, injectedRulesPre, first, last, options, i, res, match_cache);
size_t endPatternRank = ++rank;
rank = apply_rules(rank, rules, first, last, options, i, res, match_cache);
if(stack->end_pattern)
{
D(DBF_Parser, bug("end pattern: %s\n", to_s(stack->end_pattern).c_str()););
if(regexp::match_t const& match = regexp::search(stack->end_pattern, first, last, first + i, last, options))
res.emplace(stack->rule, match, stack->apply_end_last ? ++rank : endPatternRank, true);
}
rank = apply_rules(rank, injectedRulesPost, first, last, options, i, res, match_cache);
}
static bool has_cycle (size_t rule_id, size_t i, stack_ptr const& stack)
{
if(!stack->zw_begin_match || stack->anchor != i)
return false;
else if(rule_id == stack->rule->rule_id)
return true;
return stack->parent ? has_cycle(rule_id, i, stack->parent) : false;
}
// Parses one line of text and records the scope changes found in it.
//
// first, last  the bytes of the line.
// stack        parser state on entry (the frames of the begin/end and
//              begin/while rules that are currently open).
// scopes       offset → scope map; an entry is written for every offset at
//              which the scope changes, starting with offset `i`.
// firstLine    whether this is the first line (affects anchor options).
// i            offset in the line at which to start parsing.
//
// Returns the parser state after the line. Its top frame's anchor is set to
// 0 when the anchor was at the very end of the line, and to SIZE_T_MAX
// otherwise.
// NOTE(review): presumably this lets an anchored (\G) pattern match at the
// start of the next line — confirm with the callers.
stack_ptr parse (char const* first, char const* last, stack_ptr stack, std::map<size_t, scope::scope_t>& scopes, bool firstLine, size_t i)
{
	D(DBF_Parser_Flow, bug("%.*s", (int)(last - first), first););

	// ==============================
	// = apply the while patterns =
	// ==============================

	// Collect the run of frames at the top of the stack that have a while pattern (innermost first).
	std::vector<stack_ptr> while_rules;
	for(stack_ptr node = stack; node->while_pattern; node = node->parent)
		while_rules.push_back(node);

	// Start from the scope outside all while-rules; each while pattern that still matches re-adds its scopes.
	scope::scope_t scope = scopes[i] = while_rules.empty() ? stack->scope : while_rules.back()->parent->scope;
	D(DBF_Parser, bug("%s, offset %zu, %zu while rules\n", to_s(scope).c_str(), i, while_rules.size()););

	riterate(it, while_rules)
	{
		if(regexp::match_t const& m = regexp::search((*it)->while_pattern, first, last, first + i))
		{
			D(DBF_Parser_Flow, bug("while match %zu-%zu\n", m.begin(), m.end()););
			rule_t const* rule = (*it)->rule;
			if(rule->scope_string != NULL_STR)
				scope = scopes[m.begin()] = create_scope(scope, rule->scope_string, m);
			apply_captures(scope, m, rule->while_captures ?: rule->captures, scopes, firstLine);
			if(rule->content_scope_string != NULL_STR)
				scope = scopes[m.end()] = create_scope(scope, rule->content_scope_string, m);
			stack->anchor = i = m.end();
			continue;
		}

		// The while pattern failed: this frame and everything nested in it is left.
		stack = (*it)->parent;
		if(stack->while_pattern)
			stack->anchor = i;
		break;
	}

	// ======================
	// = Parse rest of line =
	// ======================

	std::set<ranked_match_t> rules;
	std::map<size_t, regexp::match_t> match_cache;
	collect_rules(first, last, i, firstLine, stack, rules, match_cache);

	D(DBF_Parser, bug("%zu rules (out of %zu), parse: %.*s", rules.size(), stack->rule->children.size(), (int)(last - first - i), first + i););
	while(!rules.empty())
	{
		DB(
			D(DBF_Parser, bug("offset: %zu\n", i););
			iterate(it, rules)
				D(DBF_Parser, bug("\t%zu-%zu, %s\n", it->match.begin(), it->match.end(), to_s(it->rule->match_pattern).c_str()););
		)

		// Take the best candidate: earliest start, lowest rank.
		ranked_match_t m = *rules.begin();
		rules.erase(rules.begin());

		if(m.match.begin() < i)
		{
			// The candidate starts before the current offset, so it is stale: search again from here.
			regexp::pattern_t const& ptrn = m.is_end_pattern ? stack->end_pattern : m.rule->match_pattern;
			if(m.match = regexp::search(ptrn, first, last, first + i, last, anchor_options(firstLine, stack->anchor == i, first, last)))
				rules.insert(m);
			continue;
		}

		i = m.match.end();
		D(DBF_Parser_Flow, bug("match %2zu-%2zu: %s\n", m.match.begin(), m.match.end(), m.rule->scope_string != NULL_STR ? m.rule->scope_string.c_str() : "(untitled)"););

		rule_t const* rule = m.rule;
		if(m.is_end_pattern)
		{
			// Leaving the current frame. The end match is scoped with the frame's
			// scope minus its content scope (when the rule has one).
			scope = stack->scope;
			if(stack->rule->content_scope_string != NULL_STR)
				scope = scopes[m.match.begin()] = scope.parent();

			apply_captures(scope, m.match, rule->end_captures ?: rule->captures, scopes, firstLine);

			bool nothingMatched = stack->zw_begin_match && stack->anchor == i;
			stack = stack->parent;
			scope = scopes[m.match.end()] = stack->scope;

			D(DBF_Parser_Flow, bug("leaving, new scope %s\n", to_s(scope).c_str()););

			if(nothingMatched) // we left a begin/end rule but haven't parsed any bytes, so we're destined to repeat this mistake
			{
				fprintf(stderr, "*** no bytes parsed by rule %s, begin = %s, end = %s, position %zu for line: %.*s\n", rule->scope_string != NULL_STR ? rule->scope_string.c_str() : "(untitled)", rule->match_string.c_str(), rule->end_string.c_str(), i, (int)(last - first), first);
				break;
			}
		}
		else if(!rule->children.empty() || rule->while_string != NULL_STR || rule->end_string != NULL_STR) // begin-part of rule
		{
			if(m.match.empty() && has_cycle(rule->rule_id, i, stack))
			{
				fprintf(stderr, "*** no bytes matched and recursive include of rule %s, begin = %s, end = %s, position %zu for line: %.*s\n", rule->scope_string != NULL_STR ? rule->scope_string.c_str() : "(untitled)", rule->match_string.c_str(), rule->end_string.c_str(), i, (int)(last - first), first);
				break;
			}

			if(rule->scope_string != NULL_STR)
				scope = scopes[m.match.begin()] = create_scope(scope, rule->scope_string, m.match);

			apply_captures(scope, m.match, rule->begin_captures ?: rule->captures, scopes, firstLine);

			if(rule->content_scope_string != NULL_STR)
				scope = scopes[m.match.end()] = create_scope(scope, rule->content_scope_string, m.match);

			// Push a new frame for the rule that was just entered.
			stack.reset(new stack_t(rule, scope, stack));
			stack->while_pattern  = rule->while_pattern;
			stack->end_pattern    = rule->end_pattern;
			stack->apply_end_last = rule->apply_end_last == "1";
			stack->anchor         = i;
			stack->zw_begin_match = m.match.empty();
			stack->parent->anchor = SIZE_T_MAX;

			// No precompiled pattern but a pattern string: build the pattern now,
			// with the back-references filled in from the begin match.
			if(!rule->while_pattern && rule->while_string != NULL_STR)
				stack->while_pattern = expand_back_references(rule->while_string, m.match);

			if(!rule->end_pattern && rule->end_string != NULL_STR)
				stack->end_pattern = expand_back_references(rule->end_string, m.match);

			D(DBF_Parser_Flow, bug("descending, new scope %s\n", to_s(scope).c_str()););
		}
		else // regular match-rule
		{
			if(m.match.empty())
			{
				fprintf(stderr, "*** no bytes parsed by rule %s, match = %s, position %zu for line: %.*s\n", rule->scope_string != NULL_STR ? rule->scope_string.c_str() : "(untitled)", rule->match_string.c_str(), i, (int)(last - first), first);
				continue; // do not re-apply since this matched zero characters
			}

			if(rule->scope_string != NULL_STR)
			{
				scope::scope_t const& newScope = create_scope(scope, rule->scope_string, m.match);
				scopes[m.match.begin()] = newScope;
				apply_captures(newScope, m.match, rule->captures, scopes, firstLine);
			}
			else
			{
				apply_captures(scope, m.match, rule->captures, scopes, firstLine);
			}
			scopes[m.match.end()] = scope;

			// Queue the next occurrence of this rule, if there is one.
			if(m.match = regexp::search(m.rule->match_pattern, first, last, first + i, last, anchor_options(firstLine, stack->anchor == i, first, last)))
				rules.insert(m);

			continue; // no context change, so skip finding rules for this context
		}

		// The stack changed, so the set of applicable rules has to be rebuilt.
		D(DBF_Parser, bug("%zu rules before collecting\n", rules.size()););
		collect_rules(first, last, i, firstLine, stack, rules, match_cache);
		D(DBF_Parser, bug("%zu rules after collecting\n", rules.size()););
	}

	D(DBF_Parser_Flow, bug("line done (%zu rules)\n", rules.size()););

	// Compare offsets rather than pointers: the anchor may be SIZE_T_MAX, and
	// adding that to `first` would form a pointer outside the buffer.
	stack->anchor = stack->anchor == (size_t)(last - first) ? 0 : SIZE_T_MAX;
	return stack;
}
}