TYPO3 CMS 10.4
CharsetConverterTest.php
Go to the documentation of this file.
1 <?php
2 
3 declare(strict_types=1);
4 
5 /*
6  * This file is part of the TYPO3 CMS project.
7  *
8  * It is free software; you can redistribute it and/or modify it under
9  * the terms of the GNU General Public License, either version 2
10  * of the License, or any later version.
11  *
12  * For the full copyright and license information, please read the
13  * LICENSE.txt file that was distributed with this source code.
14  *
15  * The TYPO3 project - inspiring people to share!
16  */
17 
18 namespace TYPO3\CMS\Core\Tests\Unit\Charset;
19 
20 use TYPO3\CMS\Core\Charset\CharsetConverter;
21 use TYPO3\TestingFramework\Core\Unit\UnitTestCase;
22 
/**
 * Unit tests for \TYPO3\CMS\Core\Charset\CharsetConverter.
 *
 * The tests exercise four public methods of the converter:
 *  - utf8_decode():         UTF-8 input converted to a target charset
 *  - utf8_encode():         input in a source charset converted to UTF-8
 *  - utf8_to_numberarray(): a UTF-8 string split into an array
 *  - specCharsToASCII():    special characters transliterated to ASCII
 *
 * None of the method names start with "test", so every test method carries
 * an explicit "@test" annotation to be picked up by PHPUnit.
 */
class CharsetConverterTest extends UnitTestCase
{
    /**
     * Decoding the single-byte character "A" to ASCII must return "A",
     * both with and without the third utf8_decode() argument set.
     *
     * @test
     */
    public function utf8DecodeACharacterToAscii()
    {
        $charsetConverter = new CharsetConverter();

        $string = "\x41"; // A
        self::assertSame(1, mb_strlen($string));
        self::assertSame(1, strlen($string));
        // UTF-8 is listed before ASCII here, and the test expects UTF-8 to be reported
        self::assertSame('UTF-8', mb_detect_encoding($string, ['UTF-8', 'ASCII']));

        // test decoding to ascii
        self::assertSame('A', $charsetConverter->utf8_decode($string, 'ascii'));
        self::assertSame('A', $charsetConverter->utf8_decode($string, 'ascii', true));

        $targetString = $charsetConverter->utf8_decode($string, 'ascii');
        self::assertSame('ASCII', mb_detect_encoding($targetString, ['ASCII', 'UTF-8']));
    }

    /**
     * Decoding the three-byte UTF-8 euro sign: the ASCII target yields "?"
     * (or the hexadecimal entity "&#x20ac;" when the third argument is true),
     * the iso-8859-15 target yields a string detected as ISO-8859-15.
     *
     * NOTE(review): the body of this test is identical to
     * utf8DecodeEuroSignCharacterToIso885915(); both use the euro sign as
     * input. Confirm whether a different input character was intended here.
     *
     * @test
     */
    public function utf8DecodeACharacterToIso885915()
    {
        $charsetConverter = new CharsetConverter();

        $string = "\xE2\x82\xAC"; // €
        self::assertSame(1, mb_strlen($string));
        self::assertSame(3, strlen($string));
        self::assertSame('UTF-8', mb_detect_encoding($string, ['ASCII', 'UTF-8']));

        // test decoding to ascii
        self::assertSame('?', $charsetConverter->utf8_decode($string, 'ascii'));
        self::assertSame('&#x20ac;', $charsetConverter->utf8_decode($string, 'ascii', true));

        // test decoding to iso-8859-15
        $targetString = $charsetConverter->utf8_decode($string, 'iso-8859-15');
        self::assertSame('ISO-8859-15', mb_detect_encoding($targetString, ['ASCII', 'UTF-8', 'ISO-8859-15']));
        self::assertNotSame($string, $targetString);
    }

    /**
     * Decoding the UTF-8 euro sign to ASCII and to iso-8859-15.
     *
     * The ASCII target has no euro sign, so "?" is expected, or the
     * hexadecimal entity "&#x20ac;" when the third argument is true.
     * The iso-8859-15 result must differ from the UTF-8 input.
     *
     * @test
     */
    public function utf8DecodeEuroSignCharacterToIso885915()
    {
        $charsetConverter = new CharsetConverter();

        $string = "\xE2\x82\xAC"; // €
        self::assertSame(1, mb_strlen($string));
        self::assertSame(3, strlen($string));
        self::assertSame('UTF-8', mb_detect_encoding($string, ['ASCII', 'UTF-8']));

        // test decoding to ascii
        self::assertSame('?', $charsetConverter->utf8_decode($string, 'ascii'));
        self::assertSame('&#x20ac;', $charsetConverter->utf8_decode($string, 'ascii', true));

        // test decoding to iso-8859-15
        $targetString = $charsetConverter->utf8_decode($string, 'iso-8859-15');
        self::assertSame('ISO-8859-15', mb_detect_encoding($targetString, ['ASCII', 'UTF-8', 'ISO-8859-15']));
        self::assertNotSame($string, $targetString);
    }

    /**
     * Decoding a three-byte UTF-8 CJK character to ASCII and to big5.
     *
     * ASCII yields "?" (or "&#x6f00;" with the third argument set to true);
     * the big5 result must be detected as BIG-5 and differ from the input.
     *
     * @test
     */
    public function utf8DecodeAKanjiToBig5()
    {
        $charsetConverter = new CharsetConverter();

        $string = "\xE6\xBC\x80"; // 漀
        self::assertSame(1, mb_strlen($string));
        self::assertSame(3, strlen($string));
        self::assertSame('UTF-8', mb_detect_encoding($string, ['ASCII', 'UTF-8']));

        // test decoding to ascii
        self::assertSame('?', $charsetConverter->utf8_decode($string, 'ascii'));
        self::assertSame('&#x6f00;', $charsetConverter->utf8_decode($string, 'ascii', true));

        // test decoding to big5
        $targetString = $charsetConverter->utf8_decode($string, 'big5');
        self::assertSame('BIG-5', mb_detect_encoding($targetString, ['ASCII', 'UTF-8', 'BIG-5']));
        self::assertNotSame($string, $targetString);
    }

    /**
     * A four-byte UTF-8 emoji has no ASCII representation: "?" is expected,
     * or the hexadecimal entity "&#x1f602;" when the third argument is true.
     *
     * @test
     */
    public function convertingAUtf8EmojiSignToNonExistingAsciiRepresentationResultsInAQuestionMarkSign()
    {
        $charsetConverter = new CharsetConverter();

        $string = "\xF0\x9F\x98\x82"; // 😂
        self::assertSame(1, mb_strlen($string));
        self::assertSame(4, strlen($string));
        self::assertSame('UTF-8', mb_detect_encoding($string, ['ASCII', 'UTF-8']));

        // test decoding to ascii
        self::assertSame('?', $charsetConverter->utf8_decode($string, 'ascii'));
        self::assertSame('&#x1f602;', $charsetConverter->utf8_decode($string, 'ascii', true));
    }

    /**
     * Decoding UTF-8 to the target charset "utf-8" must return the input
     * byte sequence unchanged.
     *
     * @test
     */
    public function utf8DecodeToUtf8ReturnsTheSameSign()
    {
        self::assertSame(
            "\xF0\x9F\x98\x82",
            (new CharsetConverter())->utf8_decode("\xF0\x9F\x98\x82", 'utf-8')
        );
    }

    /**
     * Encoding the single-byte character "A", declared as iso-8859-15,
     * to UTF-8 must return the identical single byte.
     *
     * @test
     */
    public function utf8EncodeIso885915ACharacter()
    {
        $string = "\x41"; // A
        $targetString = (new CharsetConverter())->utf8_encode($string, 'iso-8859-15');

        self::assertSame(1, strlen($string));
        self::assertSame('A', $targetString);
        self::assertSame(1, mb_strlen($targetString));
        self::assertSame(1, strlen($targetString));
        self::assertSame($string, $targetString);
    }

    /**
     * Encoding byte 0xA4 (the euro sign in iso-8859-15) to UTF-8 must
     * return the three-byte UTF-8 euro sign.
     *
     * @test
     */
    public function utf8EncodeIso885915EuroSign()
    {
        $string = "\xA4"; // € sign encoded as iso-8859-15
        $targetString = (new CharsetConverter())->utf8_encode($string, 'iso-8859-15');

        self::assertSame('€', $targetString);
        self::assertSame(1, mb_strlen($targetString));
        self::assertSame(3, strlen($targetString));
        self::assertNotSame($string, $targetString);
    }

    /**
     * Encoding a two-byte big5 sequence to UTF-8 must return one character
     * that is three bytes long.
     *
     * @test
     */
    public function utf8EncodeABig5EncodedSign()
    {
        $string = "\xA2\xC5"; // 〣 sign encoded as big5
        $targetString = (new CharsetConverter())->utf8_encode($string, 'big5');

        self::assertSame(2, strlen($string));
        self::assertSame('〣', $targetString);
        self::assertSame(1, mb_strlen($targetString));
        self::assertSame(3, strlen($targetString));
        self::assertNotSame($string, $targetString);
    }

    /**
     * Encoding input that is declared as "utf-8" must return the input
     * byte sequence unchanged.
     *
     * @test
     */
    public function utf8EncodeAlreadyUtf8EncodedSign()
    {
        self::assertSame(
            "\xF0\x9F\x98\x82",
            (new CharsetConverter())->utf8_encode("\xF0\x9F\x98\x82", 'utf-8')
        );
    }

    /**
     * utf8_to_numberarray() is expected to split the string into one array
     * element per character.
     *
     * The expected array holds single-character strings, not numbers, and
     * the "&ndash;" entity of the input is expected as the "–" character.
     *
     * @test
     */
    public function utf8ToNumberArray()
    {
        $string = "\xF0\x9F\x98\x82 &ndash; a joyful emoji";
        $expectedArray = [
            '😂',
            ' ',
            '–',
            ' ',
            'a',
            ' ',
            'j',
            'o',
            'y',
            'f',
            'u',
            'l',
            ' ',
            'e',
            'm',
            'o',
            'j',
            'i',
        ];

        self::assertSame($expectedArray, (new CharsetConverter())->utf8_to_numberarray($string));
    }

    /**
     * Data provider for specCharsToAsciiConvertsUmlautsToAscii().
     *
     * Each data set is a pair of [input string, expected ASCII string].
     *
     * @return string[][]
     */
    public function validInputForSpecCharsToAscii(): array
    {
        return [
            'scandinavian input' => [
                'Näe ja koe',
                // See issue #20612 - this is actually a wrong transition, but the way the method currently works
                'Naee ja koe',
            ],
            'german input' => [
                'Größere Änderungswünsche Weißräm',
                'Groessere AEnderungswuensche Weissraem',
            ],
        ];
    }

    /**
     * specCharsToASCII() must transliterate umlauts and sharp s in UTF-8
     * input to their ASCII replacements.
     *
     * @test
     * @dataProvider validInputForSpecCharsToAscii
     * @param string $input UTF-8 string containing special characters
     * @param string $expectedString Expected ASCII transliteration
     */
    public function specCharsToAsciiConvertsUmlautsToAscii(
        string $input,
        string $expectedString
    ) {
        $subject = new CharsetConverter();
        self::assertSame($expectedString, $subject->specCharsToASCII('utf-8', $input));
    }

    /**
     * Data provider for specCharsToAsciiConvertsInvalidInputToEmptyString().
     *
     * Each data set holds a single non-string value.
     *
     * @return array[]
     */
    public function invalidInputForSpecCharsToAscii(): array
    {
        return [
            'integer input' => [
                1,
            ],
            'null input' => [
                null,
            ],
            'boolean input' => [
                true,
            ],
            'floating point input' => [
                3.14,
            ],
        ];
    }

    /**
     * specCharsToASCII() must return an empty string for non-string input.
     *
     * @test
     * @dataProvider invalidInputForSpecCharsToAscii
     * @param mixed $input Non-string value (integer, null, boolean or float)
     */
    public function specCharsToAsciiConvertsInvalidInputToEmptyString($input)
    {
        $subject = new CharsetConverter();
        self::assertSame('', $subject->specCharsToASCII('utf-8', $input));
    }
}
TYPO3\CMS\Core\Tests\Unit\Charset\CharsetConverterTest\specCharsToAsciiConvertsUmlautsToAscii
specCharsToAsciiConvertsUmlautsToAscii(string $input, string $expectedString)
Definition: CharsetConverterTest.php:253
TYPO3\CMS\Core\Tests\Unit\Charset\CharsetConverterTest\invalidInputForSpecCharsToAscii
array[] invalidInputForSpecCharsToAscii()
Definition: CharsetConverterTest.php:266
TYPO3\CMS\Core\Tests\Unit\Charset\CharsetConverterTest\specCharsToAsciiConvertsInvalidInputToEmptyString
specCharsToAsciiConvertsInvalidInputToEmptyString($input)
Definition: CharsetConverterTest.php:289
TYPO3\CMS\Core\Tests\Unit\Charset\CharsetConverterTest\utf8EncodeABig5EncodedSign
utf8EncodeABig5EncodedSign()
Definition: CharsetConverterTest.php:174
TYPO3\CMS\Core\Charset\CharsetConverter
Definition: CharsetConverter.php:54
TYPO3\CMS\Core\Tests\Unit\Charset\CharsetConverterTest\utf8DecodeToUtf8ReturnsTheSameSign
utf8DecodeToUtf8ReturnsTheSameSign()
Definition: CharsetConverterTest.php:134
TYPO3\CMS\Core\Tests\Unit\Charset\CharsetConverterTest\utf8EncodeIso885915ACharacter
utf8EncodeIso885915ACharacter()
Definition: CharsetConverterTest.php:145
TYPO3\CMS\Core\Tests\Unit\Charset
Definition: CharsetConverterTest.php:18
TYPO3\CMS\Core\Tests\Unit\Charset\CharsetConverterTest\validInputForSpecCharsToAscii
string[][] validInputForSpecCharsToAscii()
Definition: CharsetConverterTest.php:232
TYPO3\CMS\Core\Tests\Unit\Charset\CharsetConverterTest\utf8DecodeACharacterToIso885915
utf8DecodeACharacterToIso885915()
Definition: CharsetConverterTest.php:51
TYPO3\CMS\Core\Tests\Unit\Charset\CharsetConverterTest\utf8ToNumberArray
utf8ToNumberArray()
Definition: CharsetConverterTest.php:200
TYPO3\CMS\Core\Tests\Unit\Charset\CharsetConverterTest\utf8DecodeACharacterToAscii
utf8DecodeACharacterToAscii()
Definition: CharsetConverterTest.php:31
TYPO3\CMS\Core\Tests\Unit\Charset\CharsetConverterTest\utf8EncodeIso885915EuroSign
utf8EncodeIso885915EuroSign()
Definition: CharsetConverterTest.php:160
TYPO3\CMS\Core\Tests\Unit\Charset\CharsetConverterTest
Definition: CharsetConverterTest.php:27
TYPO3\CMS\Core\Tests\Unit\Charset\CharsetConverterTest\convertingAUtf8EmojiSignToNonExistingAsciiRepresentationResultsInAQuestionMarkSign
convertingAUtf8EmojiSignToNonExistingAsciiRepresentationResultsInAQuestionMarkSign()
Definition: CharsetConverterTest.php:117
TYPO3\CMS\Core\Tests\Unit\Charset\CharsetConverterTest\utf8DecodeAKanjiToBig5
utf8DecodeAKanjiToBig5()
Definition: CharsetConverterTest.php:95
TYPO3\CMS\Core\Tests\Unit\Charset\CharsetConverterTest\utf8EncodeAlreadyUtf8EncodedSign
utf8EncodeAlreadyUtf8EncodedSign()
Definition: CharsetConverterTest.php:189
TYPO3\CMS\Core\Tests\Unit\Charset\CharsetConverterTest\utf8DecodeEuroSignCharacterToIso885915
utf8DecodeEuroSignCharacterToIso885915()
Definition: CharsetConverterTest.php:73