From 2da98f6ff16ccc461113b1698cc4252230f9dd0b Mon Sep 17 00:00:00 2001 From: SimonHeimberg Date: Mon, 5 Feb 2018 22:36:46 +0100 Subject: [PATCH 1/2] replaceIllegalCharacters replaces too long byte sequences MySQL's utf8 charset only supports characters encoded in fewer than four bytes. --- src/PHPCensor/Helper/CommandExecutor.php | 9 ++++----- tests/PHPCensor/Helper/CommandExecutorTest.php | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/PHPCensor/Helper/CommandExecutor.php b/src/PHPCensor/Helper/CommandExecutor.php index 77e05069..31ccb6b9 100644 --- a/src/PHPCensor/Helper/CommandExecutor.php +++ b/src/PHPCensor/Helper/CommandExecutor.php @@ -176,13 +176,12 @@ class CommandExecutor implements CommandExecutorInterface */ public function replaceIllegalCharacters($utf8String) { + mb_substitute_character(0xFFFD); // is '�' + $legalUtf8String = mb_convert_encoding($utf8String, 'utf8', 'utf8'); $regexp = '/[\x00-\x08\x10\x0B\x0C\x0E-\x19\x7F]' . - '|[\x00-\x7F][\x80-\xBF]+' . - '|([\xC0\xC1]|[\xF0-\xFF])[\x80-\xBF]*' . - '|[\xC2-\xDF]((?![\x80-\xBF])|[\x80-\xBF]{2,})' . 
- '|[\xE0-\xEF](([\x80-\xBF](?![\x80-\xBF]))|(?![\x80-\xBF]{2})|[\x80-\xBF]{3,})/S'; + '|[^\x{0}-\x{ffff}]/u'; // more than 3 byte UTF-8 sequences (unsupported in mysql) - return preg_replace($regexp, '�', $utf8String); + return preg_replace($regexp, '�', $legalUtf8String); } /** diff --git a/tests/PHPCensor/Helper/CommandExecutorTest.php b/tests/PHPCensor/Helper/CommandExecutorTest.php index 831d4599..af130aa2 100644 --- a/tests/PHPCensor/Helper/CommandExecutorTest.php +++ b/tests/PHPCensor/Helper/CommandExecutorTest.php @@ -100,7 +100,7 @@ EOD; ); $this->assertEquals( - \Normalizer::normalize("start 123_X08�_X00�_Xa�_5432 end"), + \Normalizer::normalize("start 123_X08�_X00�_Xa4�_5432 end"), \Normalizer::normalize($this->testedExecutor->replaceIllegalCharacters( "start 123_X08\x08_X00\x00_Xa4\xa4_5432 end" )) From 2d67afc4c7006f422716ba03f6362b6388f62e87 Mon Sep 17 00:00:00 2001 From: SimonHeimberg Date: Wed, 7 Feb 2018 10:57:47 +0100 Subject: [PATCH 2/2] Testing replaceIllegalCharacters without doing normalisation We do not use any compound characters, so normalisation is a no-op. 
--- tests/PHPCensor/Helper/CommandExecutorTest.php | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/PHPCensor/Helper/CommandExecutorTest.php b/tests/PHPCensor/Helper/CommandExecutorTest.php index af130aa2..e5dea453 100644 --- a/tests/PHPCensor/Helper/CommandExecutorTest.php +++ b/tests/PHPCensor/Helper/CommandExecutorTest.php @@ -86,24 +86,24 @@ EOD; public function testReplaceIllegalCharacters() { $this->assertEquals( - \Normalizer::normalize("start � end"), - \Normalizer::normalize($this->testedExecutor->replaceIllegalCharacters( + "start � end", + $this->testedExecutor->replaceIllegalCharacters( "start \xf0\x9c\x83\x96 end" - )) + ) ); $this->assertEquals( - \Normalizer::normalize("start � end"), - \Normalizer::normalize($this->testedExecutor->replaceIllegalCharacters( + "start � end", + $this->testedExecutor->replaceIllegalCharacters( "start \xF0\x9C\x83\x96 end" - )) + ) ); $this->assertEquals( - \Normalizer::normalize("start 123_X08�_X00�_Xa4�_5432 end"), - \Normalizer::normalize($this->testedExecutor->replaceIllegalCharacters( + "start 123_X08�_X00�_Xa4�_5432 end", + $this->testedExecutor->replaceIllegalCharacters( "start 123_X08\x08_X00\x00_Xa4\xa4_5432 end" - )) + ) ); } }