Improve folder search performance

This relates to documents with many matches on the same (long) line.

Previously we would search for the “end of line” once per match found, which for a single-line document resulted in effectively quadratic time complexity.

Furthermore, we would make a string copy of the entire line for every match. The excerpt is now limited to 500 bytes (or longer if the match requires it), and when a match starts more than 200 bytes into its line, the context shown before the match is trimmed to fewer than 150 bytes (the trim point is computed using modulo, so adjacent matches should generally show context starting from the same offset).

The test case for the optimization was a 3MB file with only 2 lines and 150,000 matches. After this commit the results are presented in about two seconds (MacBook Pro). Not impressive, but much better than before :)
This commit is contained in:
Allan Odgaard
2016-06-16 19:43:32 +02:00
parent 41ed2817e7
commit 258e5449f1

View File

@@ -186,34 +186,36 @@ namespace find
std::string newlines = text::estimate_line_endings(std::begin(text), std::end(text));
newlines = newlines == kMIX ? kLF : newlines;
size_t bol = 0, nextLine = bol, lfCount = 0;
size_t bol = 0, lfCount = 0;
size_t nextLine = text.find(newlines, bol);
for(auto const& it : ranges)
{
while(true)
while(nextLine != std::string::npos && nextLine + newlines.size() <= it.from)
{
nextLine = text.find(newlines, bol);
if(nextLine == std::string::npos || it.from < nextLine + newlines.size())
break;
bol = nextLine + newlines.size();
nextLine = text.find(newlines, bol);
++lfCount;
}
text::pos_t from(lfCount, it.from - bol);
size_t fromLine = bol;
while(true)
while(nextLine != std::string::npos && nextLine + newlines.size() <= it.to)
{
nextLine = text.find(newlines, bol);
if(nextLine == std::string::npos || it.to < nextLine + newlines.size())
break;
bol = nextLine + newlines.size();
nextLine = text.find(newlines, bol);
++lfCount;
}
text::pos_t to(lfCount, it.to - bol);
size_t eol = bol == it.to ? bol : (nextLine != std::string::npos ? nextLine : text.size());
size_t eol = bol == it.to ? bol : text.find(newlines, bol);
eol = eol != std::string::npos ? eol : text.size();
if(it.from - fromLine > 200)
fromLine = utf8::find_safe_end(text.begin(), text.begin() + it.from - ((it.from - fromLine) % 150)) - text.begin();
if(eol - fromLine > 500)
eol = utf8::find_safe_end(text.begin(), text.begin() + std::max<size_t>(fromLine + 500, it.to)) - text.begin();
match_t res(document, crc32.checksum(), it.from, it.to, text::range_t(from, to), it.captures);
res.excerpt = text.substr(fromLine, eol - fromLine);