#ifndef ENCODING_H_3OJVUZM1 #define ENCODING_H_3OJVUZM1 #include namespace encoding { struct PUBLIC classifier_t { void load (std::string const& path); void save (std::string const& path) const; template void learn (_InputIter const& first, _InputIter const& last, std::string const& charset) { auto& r = _charsets[charset]; each_word(first, last, [&](std::string const& word){ r.words[word] += 1; r.total_words += 1; _combined.words[word] += 1; _combined.total_words += 1; for(char ch : word) { if(ch > 0x7F) { r.bytes[ch] += 1; r.total_bytes += 1; _combined.bytes[ch] += 1; _combined.total_bytes += 1; } } }); } template double probability (_InputIter const& first, _InputIter const& last, std::string const& charset) const { auto record = _charsets.find(charset); if(record == _charsets.end()) return 0; std::set seen; double a = 1, b = 1; each_word(first, last, [&](std::string const& word){ auto global = _combined.words.find(word); if(global != _combined.words.end() && seen.find(word) == seen.end()) { auto local = record->second.words.find(word); if(local != record->second.words.end()) { double pWT = local->second / (double)record->second.total_words; double pWF = (global->second - local->second) / (double)_combined.total_words; double p = pWT / (pWT + pWF); a *= p; b *= 1-p; } else { a = 0; } seen.insert(word); } else { for(char ch : word) { if(ch > 0x7F) { auto global = _combined.bytes.find(ch); if(global != _combined.bytes.end()) { auto local = record->second.bytes.find(ch); if(local != record->second.bytes.end()) { double pWT = local->second / (double)record->second.total_bytes; double pWF = (global->second - local->second) / (double)_combined.total_bytes; double p = pWT / (pWT + pWF); a *= p; b *= 1-p; } else { a = 0; } } } } } }); return (a + b) == 0 ? 0 : a / (a + b); } std::vector charsets () const; bool operator== (classifier_t const& rhs) const { return _charsets == rhs._charsets && _combined == rhs._combined; } bool operator!= (classifier_t const& rhs) const { return !(*this == rhs); } private: void real_load (std::string const& path); template static void each_word (_InputIter const& first, _InputIter const& last, _F op) { for(auto eow = first; eow != last; ) { auto bow = std::find_if(eow, last, [](char ch){ return isalpha(ch) || ch > 0x7F; }); eow = std::find_if(bow, last, [](char ch){ return !isalnum(ch) && ch < 0x80; }); if(std::find_if(bow, eow, [](char ch){ return ch > 0x7F; }) != eow) op(std::string(bow, eow)); } } struct record_t { bool operator== (record_t const& rhs) const { return words == rhs.words && bytes == rhs.bytes && total_words == rhs.total_words && total_bytes == rhs.total_bytes; } bool operator!= (record_t const& rhs) const { return !(*this == rhs); } std::map words; std::map bytes; size_t total_words = 0; size_t total_bytes = 0; }; std::map _charsets; record_t _combined; }; } /* encoding */ #endif /* end of include guard: ENCODING_H_3OJVUZM1 */