Explorar el Código

:octocat: added Hanzi (GB2312/GB18030) mode according to GBT18284-2000

smiley hace 2 años
padre
commit
99e84efaa3
Se han modificado 4 ficheros con 282 adiciones y 3 borrados
  1. 5 1
      src/Common/Mode.php
  2. 179 0
      src/Data/Hanzi.php
  3. 15 2
      src/QRCode.php
  4. 83 0
      tests/Data/HanziTest.php

+ 5 - 1
src/Common/Mode.php

@@ -10,7 +10,7 @@
 
 
 namespace chillerlan\QRCode\Common;
 namespace chillerlan\QRCode\Common;
 
 
-use chillerlan\QRCode\Data\{AlphaNum, Byte, Kanji, Number};
+use chillerlan\QRCode\Data\{AlphaNum, Byte, Hanzi, Kanji, Number};
 use chillerlan\QRCode\QRCodeException;
 use chillerlan\QRCode\QRCodeException;
 
 
 /**
 /**
@@ -31,6 +31,8 @@ final class Mode{
 	/** @var int */
 	/** @var int */
 	public const KANJI            = 0b1000;
 	public const KANJI            = 0b1000;
 	/** @var int */
 	/** @var int */
+	public const HANZI            = 0b1101;
+	/** @var int */
 	public const STRCTURED_APPEND = 0b0011;
 	public const STRCTURED_APPEND = 0b0011;
 	/** @var int */
 	/** @var int */
 	public const FNC1_FIRST       = 0b0101;
 	public const FNC1_FIRST       = 0b0101;
@@ -49,6 +51,7 @@ final class Mode{
 		self::ALPHANUM => [9, 11, 13],
 		self::ALPHANUM => [9, 11, 13],
 		self::BYTE     => [8, 16, 16],
 		self::BYTE     => [8, 16, 16],
 		self::KANJI    => [8, 10, 12],
 		self::KANJI    => [8, 10, 12],
+		self::HANZI    => [8, 10, 12],
 	];
 	];
 
 
 	/**
 	/**
@@ -60,6 +63,7 @@ final class Mode{
 		self::NUMBER   => Number::class,
 		self::NUMBER   => Number::class,
 		self::ALPHANUM => AlphaNum::class,
 		self::ALPHANUM => AlphaNum::class,
 		self::KANJI    => Kanji::class,
 		self::KANJI    => Kanji::class,
+		self::HANZI    => Hanzi::class,
 		self::BYTE     => Byte::class,
 		self::BYTE     => Byte::class,
 	];
 	];
 
 

+ 179 - 0
src/Data/Hanzi.php

@@ -0,0 +1,179 @@
+<?php
+/**
+ * Class Hanzi
+ *
+ * @created      19.11.2020
+ * @author       smiley <smiley@chillerlan.net>
+ * @copyright    2020 smiley
+ * @license      MIT
+ */
+
+namespace chillerlan\QRCode\Data;
+
+use chillerlan\QRCode\Common\{BitBuffer, Mode};
+
+use function chr, implode, intdiv, is_string, mb_convert_encoding, mb_detect_encoding,
+	mb_detect_order, mb_internal_encoding, mb_strlen, ord, sprintf, strlen;
+
+/**
+ * Hanzi (simplified Chinese) mode, GBT18284-2000: double-byte characters from the GB2312/GB18030 character set
+ *
+ * Please note that this mode is not part of the QR Code specification and may not be supported by all readers (ZXing-based readers do support it).
+ *
+ * @see https://en.wikipedia.org/wiki/GB_2312
+ * @see http://www.herongyang.com/GB2312/Introduction-of-GB2312.html
+ * @see https://en.wikipedia.org/wiki/GBK_(character_encoding)#Encoding
+ * @see https://gist.github.com/codemasher/91da33c44bfb48a81a6c1426bb8e4338
+ * @see https://github.com/zxing/zxing/blob/dfb06fa33b17a9e68321be151c22846c7b78048f/core/src/main/java/com/google/zxing/qrcode/decoder/DecodedBitStreamParser.java#L172-L209
+ * @see https://www.chinesestandard.net/PDF/English.aspx/GBT18284-2000
+ */
+final class Hanzi extends QRDataModeAbstract{
+
+	/**
+	 * The mbstring encoding name (GB2312, GB18030) that input is converted to
+	 * and in which characters are counted
+	 */
+	public const ENCODING = 'GB18030';
+
+	/**
+	 * @inheritDoc
+	 */
+	protected static int $datamode = Mode::HANZI;
+
+	/**
+	 * Returns the number of characters in the segment data, counted in self::ENCODING
+	 *
+	 * @inheritDoc
+	 */
+	protected function getCharCount():int{
+		return mb_strlen($this->data, self::ENCODING);
+	}
+
+	/**
+	 * Returns the payload length in bits: every character is packed into 13 bits
+	 *
+	 * @inheritDoc
+	 */
+	public function getLengthInBits():int{
+		return $this->getCharCount() * 13;
+	}
+
+	/**
+	 * Detects the encoding of the given string and converts it to self::ENCODING if necessary
+	 *
+	 * The candidate list is handed directly to mb_detect_encoding() so that the
+	 * process-wide detection order (mb_detect_order) is left untouched.
+	 *
+	 * @inheritDoc
+	 *
+	 * @throws \chillerlan\QRCode\Data\QRCodeDataException if the encoding cannot be detected or converted
+	 */
+	public static function convertEncoding(string $string):string{
+		$candidates = [mb_internal_encoding(), 'UTF-8', 'GB2312', 'GB18030', 'CP936', 'EUC-CN', 'HZ'];
+		$detected   = mb_detect_encoding($string, $candidates, true);
+
+		if($detected === false){
+			throw new QRCodeDataException('mb_detect_encoding error');
+		}
+
+		// already in the target encoding, nothing to do
+		if($detected === self::ENCODING){
+			return $string;
+		}
+
+		$string = mb_convert_encoding($string, self::ENCODING, $detected);
+
+		if(!is_string($string)){
+			throw new QRCodeDataException('mb_convert_encoding error');
+		}
+
+		return $string;
+	}
+
+	/**
+	 * checks if a string qualifies as Hanzi/GB2312
+	 *
+	 * The string is converted to self::ENCODING first and must then consist solely of
+	 * byte pairs with byte 1 in 0xa1-0xa9 or 0xb0-0xf7 and byte 2 in 0xa1-0xfe.
+	 */
+	public static function validateString(string $string):bool{
+		$string = self::convertEncoding($string);
+		$len    = strlen($string);
+
+		// empty input or an odd byte count cannot be a sequence of double-byte characters
+		if($len < 2 || $len % 2 !== 0){
+			return false;
+		}
+
+		for($i = 0; $i < $len; $i += 2){
+			$byte1 = ord($string[$i]);
+			$byte2 = ord($string[$i + 1]);
+
+			// byte 1 unused ranges
+			if($byte1 < 0xa1 || ($byte1 > 0xa9 && $byte1 < 0xb0) || $byte1 > 0xf7){
+				return false;
+			}
+
+			// byte 2 unused ranges
+			if($byte2 < 0xa1 || $byte2 > 0xfe){
+				return false;
+			}
+
+		}
+
+		return true;
+	}
+
+	/**
+	 * Writes the mode indicator (4 bits), the character count and the 13-bit packed characters
+	 *
+	 * @inheritDoc
+	 *
+	 * @throws \chillerlan\QRCode\Data\QRCodeDataException on an illegal character occurrence
+	 */
+	public function write(BitBuffer $bitBuffer, int $versionNumber):void{
+
+		$bitBuffer
+			->put($this::$datamode, 4)
+			->put($this->getCharCount(), $this::getLengthBits($versionNumber))
+		;
+
+		$len = strlen($this->data);
+
+		for($i = 0; $i + 1 < $len; $i += 2){
+			// combine the byte pair into one 16-bit value (ord() already yields 0-255)
+			$c = (ord($this->data[$i]) << 8) | ord($this->data[$i + 1]);
+
+			// shift the two accepted ranges down so that they become contiguous
+			if($c >= 0xa1a1 && $c <= 0xaafe){
+				$c -= 0x0a1a1;
+			}
+			elseif($c >= 0xb0a1 && $c <= 0xfafe){
+				$c -= 0x0a6a1;
+			}
+			else{
+				throw new QRCodeDataException(sprintf('illegal char at %d [%d]', $i + 1, $c));
+			}
+
+			// high byte * 0x60 + low byte fits into 13 bits
+			$bitBuffer->put(((($c >> 8) & 0xff) * 0x060) + ($c & 0xff), 13);
+		}
+
+		// a dangling single byte is left over when the data length is odd
+		if($i < $len){
+			throw new QRCodeDataException(sprintf('illegal char at %d', $i + 1));
+		}
+
+	}
+
+	/**
+	 * Reads a Hanzi segment from the bit buffer and returns it in the mbstring internal encoding
+	 *
+	 * See specification GBT 18284-2000
+	 *
+	 * @throws \chillerlan\QRCode\Data\QRCodeDataException if the buffer is too short or the conversion fails
+	 */
+	public static function decodeSegment(BitBuffer $bitBuffer, int $versionNumber):string{
+		$length = $bitBuffer->read(self::getLengthBits($versionNumber));
+
+		if($bitBuffer->available() < $length * 13){
+			throw new QRCodeDataException('not enough bits available');
+		}
+
+		// Each character will require 2 bytes. Read the characters as 2-byte pairs and decode as GB2312 afterwards
+		$buffer = [];
+
+		for(; $length > 0; $length--){
+			// Each 13 bits encodes a 2-byte character
+			$twoBytes = $bitBuffer->read(13);
+
+			// intdiv() keeps the quotient an integer; shifting the float result of "/" would
+			// trigger the implicit float-to-int conversion deprecation on PHP 8.1+
+			$assembledTwoBytes = (intdiv($twoBytes, 0x060) << 8) | ($twoBytes % 0x060);
+
+			$assembledTwoBytes += ($assembledTwoBytes < 0x00a00) // 0x003BF
+				? 0x0a1a1  // In the 0xA1A1 to 0xAAFE range
+				: 0x0a6a1; // In the 0xB0A1 to 0xFAFE range
+
+			$buffer[] = chr(0xff & ($assembledTwoBytes >> 8)).chr(0xff & $assembledTwoBytes);
+		}
+
+		$decoded = mb_convert_encoding(implode($buffer), mb_internal_encoding(), self::ENCODING);
+
+		// same guard as in convertEncoding(): never hand a non-string to the typed return
+		if(!is_string($decoded)){
+			throw new QRCodeDataException('mb_convert_encoding error');
+		}
+
+		return $decoded;
+	}
+
+}

+ 15 - 2
src/QRCode.php

@@ -11,7 +11,9 @@
 namespace chillerlan\QRCode;
 namespace chillerlan\QRCode;
 
 
 use chillerlan\QRCode\Common\{EccLevel, ECICharset, MaskPattern, Mode, Version};
 use chillerlan\QRCode\Common\{EccLevel, ECICharset, MaskPattern, Mode, Version};
-use chillerlan\QRCode\Data\{AlphaNum, Byte, ECI, Kanji, Number, QRCodeDataException, QRData, QRDataModeInterface, QRMatrix};
+use chillerlan\QRCode\Data\{
+	AlphaNum, Byte, ECI, Hanzi, Kanji, Number, QRCodeDataException, QRData, QRDataModeInterface, QRMatrix
+};
 use chillerlan\QRCode\Decoder\{Decoder, DecoderResult, GDLuminanceSource, IMagickLuminanceSource, LuminanceSourceInterface};
 use chillerlan\QRCode\Decoder\{Decoder, DecoderResult, GDLuminanceSource, IMagickLuminanceSource, LuminanceSourceInterface};
 use chillerlan\QRCode\Output\{QRCodeOutputException, QROutputInterface};
 use chillerlan\QRCode\Output\{QRCodeOutputException, QROutputInterface};
 use chillerlan\Settings\SettingsContainerInterface;
 use chillerlan\Settings\SettingsContainerInterface;
@@ -380,7 +382,7 @@ class QRCode{
 	}
 	}
 
 
 	/**
 	/**
-	 * Adds a Kanji data segment
+	 * Adds a Kanji data segment (Japanese double-byte characters, Shift-JIS)
 	 *
 	 *
 	 * ISO/IEC 18004:2000 8.3.5 - Kanji Mode
 	 * ISO/IEC 18004:2000 8.3.5 - Kanji Mode
 	 */
 	 */
@@ -390,6 +392,17 @@ class QRCode{
 		return $this;
 		return $this;
 	}
 	}
 
 
+	/**
+	 * Appends a Hanzi mode segment built from the given string
+	 * (simplified Chinese double-byte characters, GB2312/GB18030)
+	 *
+	 * GBT18284-2000 Hanzi Mode
+	 */
+	public function addHanziSegment(string $data):self{
+		$segment = new Hanzi($data);
+
+		$this->addSegment($segment);
+
+		return $this;
+	}
+
 	/**
 	/**
 	 * Adds an 8-bit byte data segment
 	 * Adds an 8-bit byte data segment
 	 *
 	 *

+ 83 - 0
tests/Data/HanziTest.php

@@ -0,0 +1,83 @@
+<?php
+/**
+ * Class HanziTest
+ *
+ * @created      20.11.2021
+ * @author       smiley <smiley@chillerlan.net>
+ * @copyright    2021 smiley
+ * @license      MIT
+ */
+
+namespace chillerlan\QRCodeTest\Data;
+
+use chillerlan\QRCode\Data\Hanzi;
+use Throwable;
+use function bin2hex;
+use function defined;
+use function sprintf;
+
+/**
+ * Tests the Hanzi/GB2312 class
+ */
+class HanziTest extends DataInterfaceTestAbstract{
+
+	protected string $FQN      = Hanzi::class;
+	protected string $testdata = '无可奈何燃花作香';
+
+	/**
+	 * Sample strings paired with the expected validation result:
+	 * Hanzi/GB2312 text should validate, anything else should not
+	 */
+	public function stringValidateProvider():array{
+		return [
+			['原神', true],
+			['ABC', false],
+			['123', false],
+			// https://genshin-impact.fandom.com/wiki/Floral_Incense
+			['无可奈何燃花作香', true],
+			// the same phrase in Korean
+			['꽃잎 향초의 기도', false],
+		];
+	}
+
+	/**
+	 * Builds one data set per byte pair of the valid GB2312 range, converted to UTF-8
+	 */
+	public function hanziProvider():array{
+		$glyphs = [];
+
+		for($lead = 0xa1; $lead <= 0xf7; $lead++){
+
+			// lead bytes 0xaa-0xaf are left out
+			if($lead >= 0xaa && $lead <= 0xaf){
+				continue;
+			}
+
+			for($trail = 0xa1; $trail <= 0xfe; $trail++){
+				$glyphs[] = [mb_convert_encoding(chr($lead).chr($trail), 'UTF-8', Hanzi::ENCODING)];
+			}
+
+		}
+
+		return $glyphs;
+	}
+
+	/**
+	 * Validates every glyph from the provider; any failure or error is turned into a skipped test
+	 *
+	 * @dataProvider hanziProvider
+	 */
+	public function testValidateGB2312(string $chr):void{
+		// encoding detection may fail for several glyphs, hence the catch-all below
+		try{
+			$this::assertTrue(Hanzi::validateString($chr));
+		}
+		catch(Throwable $e){
+			/** @noinspection PhpUndefinedConstantInspection - see phpunit.xml.dist */
+			$runsInCI = defined('TEST_IS_CI') && TEST_IS_CI === true;
+
+			// outside of CI the skip message names the offending glyph
+			if(!$runsInCI){
+				$this::markTestSkipped(sprintf(
+					'invalid glyph: %s => %s',
+					bin2hex($chr),
+					mb_convert_encoding($chr, Hanzi::ENCODING, mb_internal_encoding())
+				));
+			}
+
+			$this::markTestSkipped();
+		}
+	}
+
+}