Reload file using most probable charset when it changes on disk

Previously, if the file had been loaded as ASCII/UTF-8 and was changed on disk to something other than UTF-8, TextMate would go into an infinite loop (retrying with UTF-8). It now uses the encoding classifier to find the most probable encoding (based on learned frequencies), falling back on ISO-8859-1 if there is no candidate.

Ideally it would bring up a warning, informing the user of the problem and allowing manual correction of the character set.

Fixes #1073.
This commit is contained in:
Allan Odgaard
2013-10-05 14:00:48 +02:00
parent b8bccf684b
commit bf09650c0b
2 changed files with 25 additions and 2 deletions

View File

@@ -13,6 +13,7 @@
#include <file/type.h>
#include <file/path_info.h>
#include <plist/ascii.h>
#include <encoding/encoding.h>
#include <selection/selection.h>
#include <OakSystem/application.h>
#include <crash/info.h>
@@ -1004,7 +1005,28 @@ namespace document
{
// Binds the callback to `doc`. `_wait` is set when the request is not asynchronous.
open_callback_t (document::document_ptr doc, bool async)
	: _document(doc)
	, _wait(!async)
{
}
// Unconditionally selects the encoding recorded for the document on disk.
void select_charset (std::string const& path, io::bytes_ptr content, file::open_context_ptr context)
{
	context->set_charset(_document->_disk_encoding);
}
void select_charset (std::string const& path, io::bytes_ptr content, file::open_context_ptr context)
{
if(_try_disk_encoding)
{
_try_disk_encoding = false;
context->set_charset(_document->_disk_encoding);
}
else
{
encoding::classifier_t db;
static std::string const kEncodingFrequenciesPath = path::join(path::home(), "Library/Caches/com.macromates.TextMate/EncodingFrequencies.binary");
db.load(kEncodingFrequenciesPath);
std::multimap<double, std::string> probabilities;
for(auto const& charset : db.charsets())
probabilities.emplace(1 - db.probability(content->begin(), content->end(), charset), charset);
if(!probabilities.empty() && probabilities.begin()->first < 1)
context->set_charset(probabilities.begin()->second);
else context->set_charset("ISO-8859-1");
}
}
// Selects the newline style recorded for the document on disk.
void select_line_feeds (std::string const& path, io::bytes_ptr content, file::open_context_ptr context)
{
	context->set_line_feeds(_document->_disk_newlines);
}
// Selects the file type already stored on the document.
void select_file_type (std::string const& path, io::bytes_ptr content, file::open_context_ptr context)
{
	context->set_file_type(_document->_file_type);
}
// Writes "<path>: <message>" followed by a newline to stderr; `filter` is not used.
void show_error (std::string const& path, std::string const& message, oak::uuid_t const& filter)
{
	char const* const file = path.c_str();
	char const* const text = message.c_str();
	fprintf(stderr, "%s: %s\n", file, text);
}
@@ -1057,6 +1079,7 @@ namespace document
private:
// Document whose recorded disk encoding, newlines and file type are reported by the select_* callbacks.
document::document_ptr _document;
// True until select_charset has offered the document's disk encoding once.
bool _try_disk_encoding = true;
// Initialised to !async by the constructor.
bool _wait;
// NOTE(review): presumably used to block until loading completes when _wait is set — usage is not visible here; confirm.
cf::run_loop_t _run_loop;
};

View File

@@ -1,5 +1,5 @@
TEST_SOURCES = tests/*.cc
SOURCES = src/*.cc
EXPORT = src/document.h src/directory.h src/collection.h src/reader.h
LINK += file text buffer selection bundles command cf io file scm plist OakSystem regexp settings authorization undo crash
LINK += file text buffer selection bundles command cf io file scm plist OakSystem regexp settings authorization undo crash encoding
FRAMEWORKS = ApplicationServices