mirror of
https://github.com/textmate/textmate.git
synced 2026-01-23 05:37:55 -05:00
151 lines
3.5 KiB
C++
151 lines
3.5 KiB
C++
#ifndef ENCODING_H_3OJVUZM1
|
|
#define ENCODING_H_3OJVUZM1
|
|
|
|
#include <oak/misc.h>
|
|
|
|
namespace encoding
|
|
{
|
|
struct PUBLIC classifier_t
|
|
{
|
|
void load (std::string const& path);
|
|
void save (std::string const& path) const;
|
|
|
|
template <typename _InputIter>
|
|
void learn (_InputIter const& first, _InputIter const& last, std::string const& charset)
|
|
{
|
|
auto& r = _charsets[charset];
|
|
each_word(first, last, [&](std::string const& word){
|
|
r.words[word] += 1;
|
|
r.total_words += 1;
|
|
_combined.words[word] += 1;
|
|
_combined.total_words += 1;
|
|
|
|
for(char ch : word)
|
|
{
|
|
if(ch > 0x7F)
|
|
{
|
|
r.bytes[ch] += 1;
|
|
r.total_bytes += 1;
|
|
_combined.bytes[ch] += 1;
|
|
_combined.total_bytes += 1;
|
|
}
|
|
}
|
|
});
|
|
}
|
|
|
|
template <typename _InputIter>
|
|
double probability (_InputIter const& first, _InputIter const& last, std::string const& charset) const
|
|
{
|
|
auto record = _charsets.find(charset);
|
|
if(record == _charsets.end())
|
|
return 0;
|
|
|
|
std::set<std::string> seen;
|
|
double a = 1, b = 1;
|
|
|
|
each_word(first, last, [&](std::string const& word){
|
|
auto global = _combined.words.find(word);
|
|
if(global != _combined.words.end() && seen.find(word) == seen.end())
|
|
{
|
|
auto local = record->second.words.find(word);
|
|
if(local != record->second.words.end())
|
|
{
|
|
double pWT = local->second / (double)record->second.total_words;
|
|
double pWF = (global->second - local->second) / (double)_combined.total_words;
|
|
double p = pWT / (pWT + pWF);
|
|
|
|
a *= p;
|
|
b *= 1-p;
|
|
}
|
|
else
|
|
{
|
|
a = 0;
|
|
}
|
|
|
|
seen.insert(word);
|
|
}
|
|
else
|
|
{
|
|
for(char ch : word)
|
|
{
|
|
if(ch > 0x7F)
|
|
{
|
|
auto global = _combined.bytes.find(ch);
|
|
if(global != _combined.bytes.end())
|
|
{
|
|
auto local = record->second.bytes.find(ch);
|
|
if(local != record->second.bytes.end())
|
|
{
|
|
double pWT = local->second / (double)record->second.total_bytes;
|
|
double pWF = (global->second - local->second) / (double)_combined.total_bytes;
|
|
double p = pWT / (pWT + pWF);
|
|
|
|
a *= p;
|
|
b *= 1-p;
|
|
}
|
|
else
|
|
{
|
|
a = 0;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
});
|
|
|
|
return (a + b) == 0 ? 0 : a / (a + b);
|
|
}
|
|
|
|
std::vector<std::string> charsets () const;
|
|
|
|
bool operator== (classifier_t const& rhs) const
|
|
{
|
|
return _charsets == rhs._charsets && _combined == rhs._combined;
|
|
}
|
|
|
|
bool operator!= (classifier_t const& rhs) const
|
|
{
|
|
return !(*this == rhs);
|
|
}
|
|
|
|
private:
|
|
void real_load (std::string const& path);
|
|
|
|
template <typename _InputIter, typename _F>
|
|
static void each_word (_InputIter const& first, _InputIter const& last, _F op)
|
|
{
|
|
for(auto eow = first; eow != last; )
|
|
{
|
|
auto bow = std::find_if(eow, last, [](char ch){ return isalpha(ch) || ch > 0x7F; });
|
|
eow = std::find_if(bow, last, [](char ch){ return !isalnum(ch) && ch < 0x80; });
|
|
if(std::find_if(bow, eow, [](char ch){ return ch > 0x7F; }) != eow)
|
|
op(std::string(bow, eow));
|
|
}
|
|
}
|
|
|
|
struct record_t
|
|
{
|
|
bool operator== (record_t const& rhs) const
|
|
{
|
|
return words == rhs.words && bytes == rhs.bytes && total_words == rhs.total_words && total_bytes == rhs.total_bytes;
|
|
}
|
|
|
|
bool operator!= (record_t const& rhs) const
|
|
{
|
|
return !(*this == rhs);
|
|
}
|
|
|
|
std::map<std::string, size_t> words;
|
|
std::map<char, size_t> bytes;
|
|
size_t total_words = 0;
|
|
size_t total_bytes = 0;
|
|
};
|
|
|
|
std::map<std::string, record_t> _charsets;
|
|
record_t _combined;
|
|
};
|
|
|
|
} /* encoding */
|
|
|
|
#endif /* end of include guard: ENCODING_H_3OJVUZM1 */
|