Просмотр исходного кода

:octocat: improved Kanji validation (fingers crossed)

smiley 2 года назад
Родитель
Коммит
4da90c5c3b
3 изменённых файла: 150 добавлений и 23 удаления
  1. 69 21
      src/Data/Kanji.php
  2. 10 0
      src/Data/QRDataModeAbstract.php
  3. 71 2
      tests/Data/KanjiTest.php

+ 69 - 21
src/Data/Kanji.php

@@ -12,66 +12,113 @@ namespace chillerlan\QRCode\Data;
 
 use chillerlan\QRCode\Common\{BitBuffer, Mode};
 
-use function chr, implode, mb_convert_encoding, mb_detect_encoding, mb_internal_encoding, mb_strlen, ord, sprintf, strlen;
+use function chr, implode, is_string, mb_convert_encoding, mb_detect_encoding,
+	mb_detect_order, mb_internal_encoding, mb_strlen, ord, sprintf, strlen;
 
 /**
- * Kanji mode: double-byte characters from the Shift JIS character set
+ * Kanji mode: double-byte characters from the Shift-JIS character set
  *
  * ISO/IEC 18004:2000 Section 8.3.5
  * ISO/IEC 18004:2000 Section 8.4.5
+ *
+ * @see https://en.wikipedia.org/wiki/Shift_JIS#As_defined_in_JIS_X_0208:1997
+ * @see http://www.rikai.com/library/kanjitables/kanji_codes.sjis.shtml
+ * @see https://gist.github.com/codemasher/d07d3e6e9346c08e7a41b8b978784952
  */
 final class Kanji extends QRDataModeAbstract{
 
+	// SJIS, SJIS-2004
+	// SJIS-2004 may produce errors in PHP < 8
+	public const sjisEncoding = 'SJIS';
+
 	/**
 	 * @inheritDoc
 	 */
 	protected static int $datamode = Mode::KANJI;
 
 	/**
-	 *
+	 * @inheritDoc
 	 */
-	public function __construct(string $data){
-		parent::__construct($data);
-
-		$this->data = mb_convert_encoding($this->data, 'SJIS', mb_detect_encoding($this->data));
+	protected function getCharCount():int{
+		return mb_strlen($this->data, self::sjisEncoding);
 	}
 
 	/**
 	 * @inheritDoc
 	 */
-	protected function getCharCount():int{
-		return mb_strlen($this->data, 'SJIS');
+	public function getLengthInBits():int{
+		return $this->getCharCount() * 13;
 	}
 
 	/**
 	 * @inheritDoc
 	 */
-	public function getLengthInBits():int{
-		return $this->getCharCount() * 13;
+	public static function convertEncoding(string $string):string{
+		mb_detect_order(['ASCII', mb_internal_encoding(), 'UTF-8', 'SJIS', 'SJIS-2004']);
+
+		$detected = mb_detect_encoding($string, null, true);
+
+		if($detected === false){
+			throw new QRCodeDataException('mb_detect_encoding error');
+		}
+
+		if($detected === self::sjisEncoding){
+			return $string;
+		}
+
+		$string = mb_convert_encoding($string, self::sjisEncoding, $detected);
+
+		if(!is_string($string)){
+			throw new QRCodeDataException(sprintf('invalid encoding: %s', $detected));
+		}
+
+		return $string;
 	}
 
 	/**
-	 * checks if a string qualifies as Kanji
+	 * checks if a string qualifies as SJIS Kanji
 	 */
 	public static function validateString(string $string):bool{
-		$i   = 0;
-		$len = strlen($string);
+		$string = self::convertEncoding($string);
+		$len    = strlen($string);
 
-		if($len < 2){
+		if($len < 2 || $len % 2 !== 0){
 			return false;
 		}
 
-		while($i + 1 < $len){
-			$c = ((0xff & ord($string[$i])) << 8) | (0xff & ord($string[$i + 1]));
+		for($i = 0; $i < $len; $i += 2){
+			$byte1 = ord($string[$i]);
+			$byte2 = ord($string[$i + 1]);
+
+			// byte 1 unused and vendor ranges
+			if($byte1 < 0x81 || ($byte1 > 0x84 && $byte1 < 0x88) || ($byte1 > 0x9f && $byte1 < 0xe0) ||  $byte1 > 0xea){
+				return false;
+			}
 
-			if(!($c >= 0x8140 && $c <= 0x9ffc) && !($c >= 0xe040 && $c <= 0xebbf)){
+			// byte 2 unused ranges
+			if($byte2 < 0x40 || $byte2 === 0x7f || $byte2 > 0xfc){
 				return false;
 			}
 
-			$i += 2;
+			// byte 1 is even, second byte in range 0x9f - 0xfc
+			if(($byte1 % 2) === 0){
+				if($byte2 < 0x9f){
+					return false;
+				}
+			}
+			// byte 1 is odd, second byte in range 0x40 - 0x9e (technically)
+			// now this is weird: according to spec, the second byte should be lower than 0x9e.
+			// however, converting encodings back and forth seems to mess with the string somehow.
+			// someone please riddle me this
+#			else{
+#				if($byte2 > 0x9e){
+#					return false;
+#				}
+#			}
+
 		}
 
-		return $i >= $len;
+		return true;
 	}
 
 	/**
@@ -122,6 +169,7 @@ final class Kanji extends QRDataModeAbstract{
 			throw new QRCodeDataException('not enough bits available');  // @codeCoverageIgnore
 		}
 
+		// Each character will require 2 bytes. Read the characters as 2-byte pairs and decode as SJIS afterwards
 		$buffer = [];
 		$offset = 0;
 
@@ -140,7 +188,7 @@ final class Kanji extends QRDataModeAbstract{
 			$length--;
 		}
 
-		return mb_convert_encoding(implode($buffer), mb_internal_encoding(), 'SJIS');
+		return mb_convert_encoding(implode($buffer), mb_internal_encoding(), self::sjisEncoding);
 	}
 
 }

+ 10 - 0
src/Data/QRDataModeAbstract.php

@@ -32,6 +32,7 @@ abstract class QRDataModeAbstract implements QRDataModeInterface{
 	 * @throws \chillerlan\QRCode\Data\QRCodeDataException
 	 */
 	public function __construct(string $data){
+		$data = $this::convertEncoding($data);
 
 		if(!$this::validateString($data)){
 			throw new QRCodeDataException('invalid data');
@@ -61,4 +62,13 @@ abstract class QRDataModeAbstract implements QRDataModeInterface{
 		return Mode::getLengthBitsForVersion(static::$datamode, $versionNumber);
 	}
 
+	/**
+	 * encoding conversion helper
+	 *
+	 * @throws \chillerlan\QRCode\Data\QRCodeDataException
+	 */
+	public static function convertEncoding(string $string):string{
+		return $string;
+	}
+
 }

+ 71 - 2
tests/Data/KanjiTest.php

@@ -11,6 +11,8 @@
 namespace chillerlan\QRCodeTest\Data;
 
 use chillerlan\QRCode\Data\Kanji;
+use Throwable;
+use function array_map, bin2hex, chr, defined, mb_internal_encoding, sprintf;
 
 /**
  * Tests the Kanji class
@@ -18,7 +20,7 @@ use chillerlan\QRCode\Data\Kanji;
 final class KanjiTest extends DataInterfaceTestAbstract{
 
 	protected string $FQN      = Kanji::class;
-	protected string $testdata = '茗荷茗荷茗荷茗荷茗荷';
+	protected string $testdata = '漂う花の香り';
 
 	/**
 	 * isKanji() should pass on Kanji/SJIS characters and fail on everything else
@@ -26,10 +28,77 @@ final class KanjiTest extends DataInterfaceTestAbstract{
 	public function stringValidateProvider():array{
 		return [
 			['茗荷', true],
-			['Ã', false],
+			['Ã', false], // this will fail in SJIS-2004
 			['ABC', false],
 			['123', false],
+			['漂う花の香り', true], // https://genshin-impact.fandom.com/wiki/Floral_Incense
+			['꽃잎 향초의 기도', false], // same as above in korean
 		];
 	}
 
+	/**
+	 * lists the valid SJIS kanj
+	 */
+	public function kanjiProvider():array{
+		$list = [];
+
+		for($byte1 = 0x81; $byte1 < 0xeb; $byte1 += 0x1){
+
+			// skip invalid/vendor ranges
+			if(($byte1 > 0x84 && $byte1 < 0x88) || ($byte1 > 0x9f && $byte1 < 0xe0)){
+				continue;
+			}
+
+			// second byte of a double-byte JIS X 0208 character whose first half of the JIS sequence was odd
+			if(($byte1 % 2) !== 0){
+
+				for($byte2 = 0x40; $byte2 < 0x9f; $byte2++){
+
+					if($byte2 === 0x7f){
+						continue;
+					}
+
+					$list[] = [chr($byte1).chr($byte2)];
+				}
+
+			}
+			// second byte if the first half of the JIS sequence was even
+			else{
+
+				for($byte2 = 0x9f; $byte2 < 0xfd; $byte2++){
+					$list[] = [chr($byte1).chr($byte2)];
+				}
+
+			}
+
+		}
+
+		// we need to put the joined byte sequence in a proper encoding
+		return array_map(fn($chr) => mb_convert_encoding($chr, Kanji::sjisEncoding, Kanji::sjisEncoding), $list);
+	}
+
+	/**
+	 * @dataProvider kanjiProvider
+	 */
+	public function testValidateSJIS(string $kanji):void{
+
+		// we may run into several issues due to encoding detection failures
+		try{
+			$this::assertTrue(Kanji::validateString($kanji));
+		}
+		catch(Throwable $e){
+
+			/** @noinspection PhpUndefinedConstantInspection - see phpunit.xml.dist */
+			if(defined('TEST_IS_CI') && TEST_IS_CI === true){
+				$this::markTestSkipped();
+			}
+
+			$this::markTestSkipped(sprintf(
+				'invalid glyph: %s => %s',
+				bin2hex($kanji),
+				mb_convert_encoding($kanji, Kanji::sjisEncoding, mb_internal_encoding())
+			));
+		}
+	}
+
 }