mirror of
https://github.com/textmate/textmate.git
synced 2026-04-28 03:00:34 -04:00
Add UTF-8 sanitization function
This can be used to remove malformed multibyte sequences.
This commit is contained in:
@@ -198,6 +198,45 @@ namespace utf8
|
||||
return (it + multibyte<T>::length(*it) > last) ? it : last;
|
||||
}
|
||||
|
||||
/**
 * Strip ill-formed UTF-8 from the byte range [it, last), compacting in place.
 *
 * Every well-formed sequence is kept, in order, and moved towards the front
 * of the range. A byte that cannot start a well-formed sequence (or whose
 * trailing bytes are missing or out of range) is dropped, and scanning
 * resumes at the very next byte.
 *
 * Well-formedness follows RFC 3629 / Unicode Table 3-7, so the following are
 * all rejected: stray continuation bytes, truncated sequences, overlong
 * encodings (C0/C1 leads, E0 80..9F, F0 80..8F), UTF-16 surrogates
 * (ED A0..BF), code points above U+10FFFF (F4 90.., F5..F7) and the obsolete
 * 5/6-byte forms (F8..FD), as well as FE/FF.
 *
 * @param it    Start of the range. Must be a mutable multi-pass iterator,
 *              since the range is re-read and overwritten.
 * @param last  End of the range.
 * @return      Iterator to the new logical end. Bytes in [result, last) are
 *              left-overs; callers normally erase them.
 */
template <typename _Iter>
_Iter remove_malformed (_Iter it, _Iter const& last)
{
	auto dst = it;
	for(; it != last; ++it)
	{
		unsigned char const lead = static_cast<unsigned char>(*it);

		size_t len       = 0;    // number of trailing bytes expected after the lead
		unsigned char lo = 0x80; // permitted range for the *first* trailing byte;
		unsigned char hi = 0xBF; // narrowed below for the leads that need it
		bool valid       = true;

		if(lead <= 0x7F)
		{
			len = 0;
		}
		else if(0xC2 <= lead && lead <= 0xDF) // C0 and C1 would always be overlong
		{
			len = 1;
		}
		else if(0xE0 <= lead && lead <= 0xEF)
		{
			len = 2;
			if(lead == 0xE0)
				lo = 0xA0; // below this the encoding is overlong
			else if(lead == 0xED)
				hi = 0x9F; // above this lies the surrogate range U+D800..U+DFFF
		}
		else if(0xF0 <= lead && lead <= 0xF4)
		{
			len = 3;
			if(lead == 0xF0)
				lo = 0x90; // below this the encoding is overlong
			else if(lead == 0xF4)
				hi = 0x8F; // above this exceeds U+10FFFF
		}
		else
		{
			// Stray continuation byte (80..BF), overlong lead (C0, C1) or F5..FF.
			valid = false;
		}

		auto const bt = it; // start of the candidate sequence, used for rewind/copy
		for(size_t i = 0; i < len && valid; ++i)
		{
			if(++it == last)
			{
				valid = false; // sequence truncated by end of input
			}
			else
			{
				unsigned char const trail = static_cast<unsigned char>(*it);
				valid = lo <= trail && trail <= hi;

				// Only the first trailing byte has a lead-specific range.
				lo = 0x80;
				hi = 0xBF;
			}
		}

		if(!valid)
			it = bt; // drop just the lead byte; the loop's ++it re-examines what follows
		else if(dst == bt)
			std::advance(dst, len+1); // nothing removed so far, so no copy is needed
		else
			dst = std::copy_n(bt, len+1, dst);
	}
	return dst;
}
|
||||
|
||||
} /* utf8 */
|
||||
|
||||
namespace diacritics
|
||||
|
||||
@@ -69,3 +69,19 @@ void test_to_s ()
|
||||
str += utf8::to_s(*ch);
|
||||
OAK_ASSERT_EQ(str, "“Æblegrød…” — 𠻵");
|
||||
}
|
||||
|
||||
static std::string sanitize (std::string str)
|
||||
{
|
||||
str.erase(utf8::remove_malformed(str.begin(), str.end()), str.end());
|
||||
return str;
|
||||
}
|
||||
|
||||
void test_sanitize ()
|
||||
{
|
||||
OAK_ASSERT_EQ("Æblegrød", sanitize("Æblegrød"));
|
||||
OAK_ASSERT_EQ("Æblegrød", sanitize("Æb\xFFlegrød"));
|
||||
OAK_ASSERT_EQ("Æblegrød", sanitize("Æb\xC0legrød"));
|
||||
OAK_ASSERT_EQ("Æblegrød", sanitize("Æb\xC0\xFElegrød"));
|
||||
OAK_ASSERT_EQ("Æblegrød", sanitize("Æb\xFE\xC0legrød"));
|
||||
OAK_ASSERT_EQ("Æblegrød", sanitize("Æblegrød\xFE"));
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user