replaceIllegalCharacters replaces too long byte sequences

MySQL's `utf8` character set only supports UTF-8 characters encoded in at most three bytes.
This commit is contained in:
SimonHeimberg 2018-02-05 22:36:46 +01:00
parent 58798fb2f9
commit 2da98f6ff1
2 changed files with 5 additions and 6 deletions

View file

@ -176,13 +176,12 @@ class CommandExecutor implements CommandExecutorInterface
*/
public function replaceIllegalCharacters($utf8String)
{
    // U+FFFD REPLACEMENT CHARACTER, written as raw UTF-8 bytes so the literal
    // does not depend on the encoding this source file is stored in.
    $replacement = "\xEF\xBF\xBD";

    // Let mbstring substitute U+FFFD for every malformed byte sequence.
    // The substitute character is a process-wide setting, so remember the
    // previous value and put it back once the conversion is done.
    $previousSubstitute = mb_substitute_character();
    mb_substitute_character(0xFFFD);
    $legalUtf8String = mb_convert_encoding($utf8String, 'UTF-8', 'UTF-8');
    mb_substitute_character($previousSubstitute);

    // First alternative: control characters that are replaced.
    // NOTE(review): "\x10" is already inside "\x0E-\x19", and "\x1A-\x1F" are
    // not covered - this looks like a hex/decimal slip; confirm the intended set.
    // Second alternative: any code point above U+FFFF, i.e. a UTF-8 sequence
    // longer than 3 bytes (unsupported by MySQL's utf8 charset). The "u"
    // modifier is safe here because the input was made valid UTF-8 above.
    $regexp = '/[\x00-\x08\x10\x0B\x0C\x0E-\x19\x7F]' .
        '|[^\x{0}-\x{ffff}]/u';

    return preg_replace($regexp, $replacement, $legalUtf8String);
}
/**

View file

@ -100,7 +100,7 @@ EOD;
);
$this->assertEquals(
\Normalizer::normalize("start 123_X08<30>_X00<30>_Xa<EFBFBD>_5432 end"),
\Normalizer::normalize("start 123_X08<30>_X00<30>_Xa4<EFBFBD>_5432 end"),
\Normalizer::normalize($this->testedExecutor->replaceIllegalCharacters(
"start 123_X08\x08_X00\x00_Xa4\xa4_5432 end"
))