Add UTF-8 sanitization function

This can be used to remove malformed multibyte sequences.
This commit is contained in:
Allan Odgaard
2013-10-07 18:34:40 +02:00
parent 51a7424c90
commit 2fa5d7ddb2
2 changed files with 55 additions and 0 deletions

View File

@@ -198,6 +198,45 @@ namespace utf8
return (it + multibyte<T>::length(*it) > last) ? it : last;
}
// Compacts [it, last) in place so that only well-formed UTF-8 sequences remain
// and returns the new logical end (everything from the returned iterator up to
// `last` is left-over garbage the caller should erase, as with std::remove).
//
// A sequence is kept only if it is well-formed per RFC 3629 / Unicode Table 3-7:
//   - lead bytes 0xC0, 0xC1 and 0xF5..0xFF are never valid,
//   - overlong encodings (E0 80..9F, F0 80..8F) are rejected,
//   - UTF-16 surrogates (ED A0..BF) are rejected,
//   - code points above U+10FFFF (F4 90..) are rejected,
//   - a sequence cut short by `last` or by a non-continuation byte is rejected.
//
// When a sequence is rejected only its first byte is dropped; scanning resumes
// at the very next byte so that a valid character following a bad lead byte is
// not lost.
//
// _Iter must be a mutable forward iterator (or stronger) over byte-sized values.
template <typename _Iter>
_Iter remove_malformed (_Iter it, _Iter const& last)
{
	auto dst = it;
	for(; it != last; ++it)
	{
		// Work on the unsigned value so range comparisons behave regardless of
		// whether plain char is signed on this platform.
		unsigned char const lead = static_cast<unsigned char>(*it);

		bool valid = true;
		size_t len = 0;                  // number of continuation bytes expected
		unsigned char lo = 0x80;         // allowed range for the *next* continuation
		unsigned char hi = 0xBF;         // byte; narrowed for the first one below

		if(lead < 0x80)
		{
			len = 0;
		}
		else if(0xC2 <= lead && lead <= 0xDF)
		{
			len = 1;
		}
		else if(0xE0 <= lead && lead <= 0xEF)
		{
			len = 2;
			if(lead == 0xE0)
				lo = 0xA0;                 // below this would be an overlong form
			else if(lead == 0xED)
				hi = 0x9F;                 // above this would be a surrogate
		}
		else if(0xF0 <= lead && lead <= 0xF4)
		{
			len = 3;
			if(lead == 0xF0)
				lo = 0x90;                 // below this would be an overlong form
			else if(lead == 0xF4)
				hi = 0x8F;                 // above this would exceed U+10FFFF
		}
		else
		{
			// Stray continuation byte, overlong 2-byte lead (C0/C1), or a lead
			// byte for the obsolete 4+/5/6-byte forms (F5..FF).
			valid = false;
		}

		auto const bt = it;              // start of the candidate sequence
		for(size_t i = 0; i < len && valid; ++i)
		{
			valid = ++it != last;
			if(valid)
			{
				unsigned char const trail = static_cast<unsigned char>(*it);
				valid = lo <= trail && trail <= hi;

				// Only the first continuation byte has a restricted range.
				lo = 0x80;
				hi = 0xBF;
			}
		}

		if(!valid)
			it = bt;                      // drop just the lead byte, rescan the rest
		else if(dst == bt)
			std::advance(dst, len+1);     // nothing removed so far: no copy needed
		else
			dst = std::copy_n(bt, len+1, dst);
	}
	return dst;
}
} /* utf8 */
namespace diacritics

View File

@@ -69,3 +69,19 @@ void test_to_s ()
str += utf8::to_s(*ch);
OAK_ASSERT_EQ(str, "“Æblegrød…” — 𠻵");
}
// Test helper: runs utf8::remove_malformed over a private copy of `str`,
// trims the copy at the iterator it returns, and hands the result back.
static std::string sanitize (std::string str)
{
	auto const newEnd = utf8::remove_malformed(str.begin(), str.end());
	str.erase(newEnd, str.end());
	return str;
}
// Checks that sanitize() leaves well-formed text alone and strips stray bytes
// that do not form a complete UTF-8 sequence. Every case expects the same
// clean string, "Æblegrød".
void test_sanitize ()
{
// Already well-formed input must come back unchanged.
OAK_ASSERT_EQ("Æblegrød", sanitize("Æblegrød"));
// A lone 0xFF byte in the middle of the text.
OAK_ASSERT_EQ("Æblegrød", sanitize("Æb\xFFlegrød"));
// A lone 0xC0 byte that is followed by ASCII rather than a continuation byte.
OAK_ASSERT_EQ("Æblegrød", sanitize("Æb\xC0legrød"));
// Two bad bytes in a row: 0xC0 followed by 0xFE.
OAK_ASSERT_EQ("Æblegrød", sanitize("Æb\xC0\xFElegrød"));
// The same two bad bytes in the opposite order.
OAK_ASSERT_EQ("Æblegrød", sanitize("Æb\xFE\xC0legrød"));
// A bad byte as the very last byte of the input.
OAK_ASSERT_EQ("Æblegrød", sanitize("Æblegrød\xFE"));
}