diff --git a/src/PHPCensor/Helper/CommandExecutor.php b/src/PHPCensor/Helper/CommandExecutor.php
index 77e05069..31ccb6b9 100644
--- a/src/PHPCensor/Helper/CommandExecutor.php
+++ b/src/PHPCensor/Helper/CommandExecutor.php
@@ -176,13 +176,19 @@ class CommandExecutor implements CommandExecutorInterface
      */
     public function replaceIllegalCharacters($utf8String)
     {
+        // mb_convert_encoding() swaps invalid byte sequences for the substitute
+        // character, which is process-wide state: set it for this call only.
+        $previousSubstitute = mb_substitute_character();
+        mb_substitute_character(0xFFFD); // is '�'
+        $legalUtf8String = mb_convert_encoding($utf8String, 'UTF-8', 'UTF-8');
+        mb_substitute_character($previousSubstitute);
+
+        // NOTE(review): \x10 already lies inside \x0E-\x19, and \x1A-\x1F
+        // (incl. ESC) are left untouched; confirm this range is intended.
         $regexp = '/[\x00-\x08\x10\x0B\x0C\x0E-\x19\x7F]' .
-                  '|[\x00-\x7F][\x80-\xBF]+' .
-                  '|([\xC0\xC1]|[\xF0-\xFF])[\x80-\xBF]*' .
-                  '|[\xC2-\xDF]((?![\x80-\xBF])|[\x80-\xBF]{2,})' .
-                  '|[\xE0-\xEF](([\x80-\xBF](?![\x80-\xBF]))|(?![\x80-\xBF]{2})|[\x80-\xBF]{3,})/S';
+                  '|[^\x{0}-\x{ffff}]/u'; // more than 3 byte UTF-8 sequences (unsupported in mysql)
 
-        return preg_replace($regexp, '�', $utf8String);
+        return preg_replace($regexp, '�', $legalUtf8String);
     }
 
     /**
diff --git a/tests/PHPCensor/Helper/CommandExecutorTest.php b/tests/PHPCensor/Helper/CommandExecutorTest.php
index 831d4599..af130aa2 100644
--- a/tests/PHPCensor/Helper/CommandExecutorTest.php
+++ b/tests/PHPCensor/Helper/CommandExecutorTest.php
@@ -100,7 +100,7 @@ EOD;
         );
 
         $this->assertEquals(
-            \Normalizer::normalize("start 123_X08�_X00�_Xa�_5432 end"),
+            \Normalizer::normalize("start 123_X08�_X00�_Xa4�_5432 end"),
             \Normalizer::normalize($this->testedExecutor->replaceIllegalCharacters(
                 "start 123_X08\x08_X00\x00_Xa4\xa4_5432 end"
             ))