‪TYPO3CMS  9.5
CharsetConverter.php
Go to the documentation of this file.
1 <?php
3 
4 /*
5  * This file is part of the TYPO3 CMS project.
6  *
7  * It is free software; you can redistribute it and/or modify it under
8  * the terms of the GNU General Public License, either version 2
9  * of the License, or any later version.
10  *
11  * For the full copyright and license information, please read the
12  * LICENSE.txt file that was distributed with this source code.
13  *
14  * The TYPO3 project - inspiring people to share!
15  */
16 
22 
54 {
56 
61  protected ‪$deprecatedPublicProperties = [
62  'noCharByteVal' => 'Using $noCharByteVal of class CharsetConverter from the outside is discouraged, as this only reflects a fixed constant.',
63  'parsedCharsets' => 'Using $parsedCharsets of class CharsetConverter from the outside is discouraged, as this only reflects a local runtime cache.',
64  'toASCII' => 'Using $toASCII of class CharsetConverter from the outside is discouraged, as this only reflects a local runtime cache.',
65  'twoByteSets' => 'Using $twoByteSets of class CharsetConverter from the outside is discouraged.',
66  'eucBasedSets' => 'Using $eucBasedSets of class CharsetConverter from the outside is discouraged.',
67  'synonyms' => 'Using $synonyms of class CharsetConverter from the outside is discouraged, as this functionality will be removed in TYPO3 v10.0.',
68  ];
69 
75  protected ‪$noCharByteVal = 63;
76 
82  protected ‪$parsedCharsets = [];
83 
89  protected ‪$toASCII = [];
90 
96  protected ‪$twoByteSets = [
97  'ucs-2' => 1
98  ];
99 
105  protected ‪$eucBasedSets = [
106  'gb2312' => 1, // Chinese, simplified.
107  'big5' => 1, // Chinese, traditional.
108  'euc-kr' => 1, // Korean
109  'shift_jis' => 1
110  ];
111 
119  protected ‪$synonyms = [
120  'us' => 'ascii',
121  'us-ascii' => 'ascii',
122  'cp819' => 'iso-8859-1',
123  'ibm819' => 'iso-8859-1',
124  'iso-ir-100' => 'iso-8859-1',
125  'iso-ir-101' => 'iso-8859-2',
126  'iso-ir-109' => 'iso-8859-3',
127  'iso-ir-110' => 'iso-8859-4',
128  'iso-ir-144' => 'iso-8859-5',
129  'iso-ir-127' => 'iso-8859-6',
130  'iso-ir-126' => 'iso-8859-7',
131  'iso-ir-138' => 'iso-8859-8',
132  'iso-ir-148' => 'iso-8859-9',
133  'iso-ir-157' => 'iso-8859-10',
134  'iso-ir-179' => 'iso-8859-13',
135  'iso-ir-199' => 'iso-8859-14',
136  'iso-ir-203' => 'iso-8859-15',
137  'csisolatin1' => 'iso-8859-1',
138  'csisolatin2' => 'iso-8859-2',
139  'csisolatin3' => 'iso-8859-3',
140  'csisolatin5' => 'iso-8859-9',
141  'csisolatin8' => 'iso-8859-14',
142  'csisolatin9' => 'iso-8859-15',
143  'csisolatingreek' => 'iso-8859-7',
144  'iso-celtic' => 'iso-8859-14',
145  'latin1' => 'iso-8859-1',
146  'latin2' => 'iso-8859-2',
147  'latin3' => 'iso-8859-3',
148  'latin5' => 'iso-8859-9',
149  'latin6' => 'iso-8859-10',
150  'latin8' => 'iso-8859-14',
151  'latin9' => 'iso-8859-15',
152  'l1' => 'iso-8859-1',
153  'l2' => 'iso-8859-2',
154  'l3' => 'iso-8859-3',
155  'l5' => 'iso-8859-9',
156  'l6' => 'iso-8859-10',
157  'l8' => 'iso-8859-14',
158  'l9' => 'iso-8859-15',
159  'cyrillic' => 'iso-8859-5',
160  'arabic' => 'iso-8859-6',
161  'tis-620' => 'iso-8859-11',
162  'win874' => 'windows-874',
163  'win1250' => 'windows-1250',
164  'win1251' => 'windows-1251',
165  'win1252' => 'windows-1252',
166  'win1253' => 'windows-1253',
167  'win1254' => 'windows-1254',
168  'win1255' => 'windows-1255',
169  'win1256' => 'windows-1256',
170  'win1257' => 'windows-1257',
171  'win1258' => 'windows-1258',
172  'cp1250' => 'windows-1250',
173  'cp1251' => 'windows-1251',
174  'cp1252' => 'windows-1252',
175  'ms-ee' => 'windows-1250',
176  'ms-ansi' => 'windows-1252',
177  'ms-greek' => 'windows-1253',
178  'ms-turk' => 'windows-1254',
179  'winbaltrim' => 'windows-1257',
180  'koi-8ru' => 'koi-8r',
181  'koi8r' => 'koi-8r',
182  'cp878' => 'koi-8r',
183  'mac' => 'macroman',
184  'macintosh' => 'macroman',
185  'euc-cn' => 'gb2312',
186  'x-euc-cn' => 'gb2312',
187  'euccn' => 'gb2312',
188  'cp936' => 'gb2312',
189  'big-5' => 'big5',
190  'cp950' => 'big5',
191  'eucjp' => 'euc-jp',
192  'sjis' => 'shift_jis',
193  'shift-jis' => 'shift_jis',
194  'cp932' => 'shift_jis',
195  'cp949' => 'euc-kr',
196  'utf7' => 'utf-7',
197  'utf8' => 'utf-8',
198  'utf16' => 'utf-16',
199  'utf32' => 'utf-32',
200  'ucs2' => 'ucs-2',
201  'ucs4' => 'ucs-4'
202  ];
203 
211  public function ‪parse_charset($charset)
212  {
213  trigger_error('Method CharsetConverter->parse_charset() will be removed in TYPO3 v10.0. Use native mbstring functions directly.', E_USER_DEPRECATED);
214  $charset = trim(strtolower($charset));
215  if (isset($this->synonyms[$charset])) {
216  $charset = $this->synonyms[$charset];
217  }
218  return $charset;
219  }
220 
221  /********************************************
222  *
223  * Charset Conversion functions
224  *
225  ********************************************/
236  public function ‪conv($inputString, $fromCharset, $toCharset, $useEntityForNoChar = null)
237  {
238  if ($fromCharset === $toCharset) {
239  return $inputString;
240  }
241  if ($useEntityForNoChar === null) {
242  $useEntityForNoChar = false;
243  }
244  if (!$useEntityForNoChar) {
245  trigger_error('Calling CharsetConverter->conv() without the necessity to convert the entities for unavailable characters is discouraged, and will not be possible via conv() anymore in TYPO3 v10.0. Use native mb_convert_encoding() directly, or set the 4th parameter of conv() to true.', E_USER_DEPRECATED);
246  }
247 
248  // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
249  if ($toCharset === 'utf-8' || !$useEntityForNoChar) {
250  // Returns FALSE for unsupported charsets
251  $convertedString = mb_convert_encoding($inputString, $toCharset, $fromCharset);
252  if (false !== $convertedString) {
253  return $convertedString;
254  }
255  }
256  if ($fromCharset !== 'utf-8') {
257  $inputString = $this->‪utf8_encode($inputString, $fromCharset);
258  }
259  if ($toCharset !== 'utf-8') {
260  $inputString = $this->‪utf8_decode($inputString, $toCharset, $useEntityForNoChar);
261  }
262  return $inputString;
263  }
264 
276  public function ‪convArray(&$array, $fromCharset, $toCharset, $useEntityForNoChar = false)
277  {
278  trigger_error('Method CharsetConverter->convArray() will be removed in TYPO3 v10.0. Use conv() directly and loop over the array in the callers code.', E_USER_DEPRECATED);
279  foreach ($array as $key => $value) {
280  if (is_array($array[$key])) {
281  $this->‪convArray($array[$key], $fromCharset, $toCharset, $useEntityForNoChar);
282  } elseif (is_string($array[$key])) {
283  $array[$key] = $this->‪conv($array[$key], $fromCharset, $toCharset, $useEntityForNoChar);
284  }
285  }
286  }
287 
295  public function ‪utf8_encode($str, $charset)
296  {
297  if ($charset === 'utf-8') {
298  return $str;
299  }
300  // Charset is case-insensitive
301  // Parse conv. table if not already
302  if ($this->‪initCharset($charset)) {
303  $strLen = strlen($str);
304  $outStr = '';
305  // Traverse each char in string
306  for ($a = 0; $a < $strLen; $a++) {
307  $chr = substr($str, $a, 1);
308  $ord = ord($chr);
309  // If the charset has two bytes per char
310  if (isset($this->twoByteSets[$charset])) {
311  $ord2 = ord($str[$a + 1]);
312  // Assume big endian
313  $ord = $ord << 8 | $ord2;
314  // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
315  if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
316  $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
317  } else {
318  $outStr .= chr($this->noCharByteVal);
319  }
320  // No char exists
321  $a++;
322  } elseif ($ord > 127) {
323  // If char has value over 127 it's a multibyte char in UTF-8
324  // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
325  if (isset($this->eucBasedSets[$charset])) {
326  // Shift-JIS: chars between 160 and 223 are single byte
327  if ($charset !== 'shift_jis' || ($ord < 160 || $ord > 223)) {
328  $a++;
329  $ord2 = ord(substr($str, $a, 1));
330  $ord = $ord * 256 + $ord2;
331  }
332  }
333  if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
334  // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
335  $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
336  } else {
337  $outStr .= chr($this->noCharByteVal);
338  }
339  } else {
340  $outStr .= $chr;
341  }
342  }
343  return $outStr;
344  }
345  return '';
346  }
347 
356  public function ‪utf8_decode($str, $charset, $useEntityForNoChar = false)
357  {
358  if ($charset === 'utf-8') {
359  return $str;
360  }
361  // Charset is case-insensitive.
362  // Parse conv. table if not already
363  if ($this->‪initCharset($charset)) {
364  $strLen = strlen($str);
365  $outStr = '';
366  // Traverse each char in UTF-8 string
367  for ($a = 0, $i = 0; $a < $strLen; $a++, $i++) {
368  $chr = substr($str, $a, 1);
369  $ord = ord($chr);
370  // This means multibyte! (first byte!)
371  if ($ord > 127) {
372  // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
373  if ($ord & 64) {
374  // Add first byte
375  $buf = $chr;
376  // For each byte in multibyte string
377  for ($b = 0; $b < 8; $b++) {
378  // Shift it left and
379  $ord = $ord << 1;
380  // ... and with 8th bit - if that is set, then there are still bytes in sequence.
381  if ($ord & 128) {
382  $a++;
383  // ... and add the next char.
384  $buf .= substr($str, $a, 1);
385  } else {
386  break;
387  }
388  }
389  // If the UTF-8 char-sequence is found then...
390  if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) {
391  // The local number
392  $mByte = $this->parsedCharsets[$charset]['utf8'][$buf];
393  // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
394  if ($mByte > 255) {
395  $outStr .= chr($mByte >> 8 & 255) . chr($mByte & 255);
396  } else {
397  $outStr .= chr($mByte);
398  }
399  } elseif ($useEntityForNoChar) {
400  // Create num entity:
401  $outStr .= '&#' . $this->‪utf8CharToUnumber($buf, true) . ';';
402  } else {
403  $outStr .= chr($this->noCharByteVal);
404  }
405  } else {
406  $outStr .= chr($this->noCharByteVal);
407  }
408  } else {
409  $outStr .= $chr;
410  }
411  }
412  return $outStr;
413  }
414  return '';
415  }
416 
424  public function ‪utf8_to_entities($str)
425  {
426  trigger_error('Method CharsetConverter->utf8_to_entities() will be removed in TYPO3 v10.0. Use native PHP functions instead.', E_USER_DEPRECATED);
427  $strLen = strlen($str);
428  $outStr = '';
429  // Traverse each char in UTF-8 string.
430  for ($a = 0; $a < $strLen; $a++) {
431  $chr = substr($str, $a, 1);
432  $ord = ord($chr);
433  // This means multibyte! (first byte!)
434  if ($ord > 127) {
435  // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
436  if ($ord & 64) {
437  // Add first byte
438  $buf = $chr;
439  // For each byte in multibyte string...
440  for ($b = 0; $b < 8; $b++) {
441  // Shift it left and ...
442  $ord = $ord << 1;
443  // ... and with 8th bit - if that is set, then there are still bytes in sequence.
444  if ($ord & 128) {
445  $a++;
446  // ... and add the next char.
447  $buf .= substr($str, $a, 1);
448  } else {
449  break;
450  }
451  }
452  $outStr .= '&#' . $this->‪utf8CharToUnumber($buf, true) . ';';
453  } else {
454  $outStr .= chr($this->noCharByteVal);
455  }
456  } else {
457  $outStr .= $chr;
458  }
459  }
460  return $outStr;
461  }
462 
470  public function ‪entities_to_utf8($str)
471  {
472  trigger_error('Method CharsetConverter->entities_to_utf8() will be removed in TYPO3 v10.0. Use native PHP function html_entity_decode() instead.', E_USER_DEPRECATED);
473  $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT));
474  $token = md5(microtime());
475  $parts = explode($token, preg_replace('/(&([#[:alnum:]]*);)/', $token . '${2}' . $token, $str));
476  foreach ($parts as $k => $v) {
477  // Only take every second element
478  if ($k % 2 === 0) {
479  continue;
480  }
481  $position = 0;
482  // Dec or hex entities
483  if (substr($v, $position, 1) === '#') {
484  $position++;
485  if (substr($v, $position, 1) === 'x') {
486  $v = hexdec(substr($v, ++$position));
487  } else {
488  $v = substr($v, $position);
489  }
490  $parts[$k] = $this->‪UnumberToChar($v);
491  } elseif (isset($trans_tbl['&' . $v . ';'])) {
492  // Other entities:
493  $v = $trans_tbl['&' . $v . ';'];
494  $parts[$k] = $v;
495  } else {
496  // No conversion:
497  $parts[$k] = '&' . $v . ';';
498  }
499  }
500  return implode('', $parts);
501  }
502 
511  public function ‪utf8_to_numberarray($str)
512  {
513  // Entities must be registered as well
514  $str = html_entity_decode($str, ENT_COMPAT, 'utf-8');
515 
516  // Do conversion:
517  $strLen = strlen($str);
518  $outArr = [];
519  // Traverse each char in UTF-8 string.
520  for ($a = 0; $a < $strLen; $a++) {
521  $chr = substr($str, $a, 1);
522  $ord = ord($chr);
523  // This means multibyte! (first byte!)
524  if ($ord > 127) {
525  // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
526  if ($ord & 64) {
527  // Add first byte
528  $buf = $chr;
529  // For each byte in multibyte string...
530  for ($b = 0; $b < 8; $b++) {
531  // Shift it left and ...
532  $ord = $ord << 1;
533  // ... and with 8th bit - if that is set, then there are still bytes in sequence.
534  if ($ord & 128) {
535  $a++;
536  // ... and add the next char.
537  $buf .= substr($str, $a, 1);
538  } else {
539  break;
540  }
541  }
542  $outArr[] = $buf;
543  } else {
544  $outArr[] = chr($this->noCharByteVal);
545  }
546  } else {
547  $outArr[] = chr($ord);
548  }
549  }
550  return $outArr;
551  }
552 
573  public function ‪UnumberToChar($unicodeInteger)
574  {
575  $str = '';
576  if ($unicodeInteger < 128) {
577  $str .= chr($unicodeInteger);
578  } elseif ($unicodeInteger < 2048) {
579  $str .= chr(192 | $unicodeInteger >> 6);
580  $str .= chr(128 | $unicodeInteger & 63);
581  } elseif ($unicodeInteger < 65536) {
582  $str .= chr(224 | $unicodeInteger >> 12);
583  $str .= chr(128 | $unicodeInteger >> 6 & 63);
584  $str .= chr(128 | $unicodeInteger & 63);
585  } elseif ($unicodeInteger < 2097152) {
586  $str .= chr(240 | $unicodeInteger >> 18);
587  $str .= chr(128 | $unicodeInteger >> 12 & 63);
588  $str .= chr(128 | $unicodeInteger >> 6 & 63);
589  $str .= chr(128 | $unicodeInteger & 63);
590  } elseif ($unicodeInteger < 67108864) {
591  $str .= chr(248 | $unicodeInteger >> 24);
592  $str .= chr(128 | $unicodeInteger >> 18 & 63);
593  $str .= chr(128 | $unicodeInteger >> 12 & 63);
594  $str .= chr(128 | $unicodeInteger >> 6 & 63);
595  $str .= chr(128 | $unicodeInteger & 63);
596  } elseif ($unicodeInteger < 2147483648) {
597  $str .= chr(252 | $unicodeInteger >> 30);
598  $str .= chr(128 | $unicodeInteger >> 24 & 63);
599  $str .= chr(128 | $unicodeInteger >> 18 & 63);
600  $str .= chr(128 | $unicodeInteger >> 12 & 63);
601  $str .= chr(128 | $unicodeInteger >> 6 & 63);
602  $str .= chr(128 | $unicodeInteger & 63);
603  } else {
604  // Cannot express a 32-bit character in UTF-8
605  $str .= chr($this->noCharByteVal);
606  }
607  return $str;
608  }
609 
619  public function ‪utf8CharToUnumber($str, $hex = false)
620  {
621  // First char
622  $ord = ord($str[0]);
623  // This verifies that it IS a multi byte string
624  if (($ord & 192) === 192) {
625  $binBuf = '';
626  $b = 0;
627  // For each byte in multibyte string...
628  for (; $b < 8; $b++) {
629  // Shift it left and ...
630  $ord = $ord << 1;
631  // ... and with 8th bit - if that is set, then there are still bytes in sequence.
632  if ($ord & 128) {
633  $binBuf .= substr('00000000' . decbin(ord(substr($str, $b + 1, 1))), -6);
634  } else {
635  break;
636  }
637  }
638  $binBuf = substr('00000000' . decbin(ord($str[0])), -(6 - $b)) . $binBuf;
639  $int = bindec($binBuf);
640  } else {
641  $int = $ord;
642  }
643  return $hex ? 'x' . dechex($int) : $int;
644  }
645 
646  /********************************************
647  *
648  * Init functions
649  *
650  ********************************************/
661  protected function ‪initCharset($charset)
662  {
663  // Only process if the charset is not yet loaded:
664  if (empty($this->parsedCharsets[$charset])) {
665  // Conversion table filename:
666  $charsetConvTableFile = ‪ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/csconvtbl/' . $charset . '.tbl';
667  // If the conversion table is found:
668  if ($charset && GeneralUtility::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile)) {
669  // Cache file for charsets:
670  // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
671  $cacheFile = ‪Environment::getVarPath() . '/charset/charset_' . $charset . '.tbl';
672  if ($cacheFile && @is_file($cacheFile)) {
673  $this->parsedCharsets[$charset] = unserialize(file_get_contents($cacheFile), ['allowed_classes' => false]);
674  } else {
675  // Parse conversion table into lines:
676  $lines = GeneralUtility::trimExplode(LF, file_get_contents($charsetConvTableFile), true);
677  // Initialize the internal variable holding the conv. table:
678  $this->parsedCharsets[$charset] = ['local' => [], 'utf8' => []];
679  // traverse the lines:
680  $detectedType = '';
681  foreach ($lines as $value) {
682  // Comment line or blanks are ignored.
683  if (trim($value) && $value[0] !== '#') {
684  // Detect type if not done yet: (Done on first real line)
685  // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
686  if (!$detectedType) {
687  $detectedType = preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/', $value) ? 'whitespaced' : 'ms-token';
688  }
689  $hexbyte = '';
690  $utf8 = '';
691  if ($detectedType === 'ms-token') {
692  list($hexbyte, $utf8) = preg_split('/[=:]/', $value, 3);
693  } elseif ($detectedType === 'whitespaced') {
694  $regA = [];
695  preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/', $value, $regA);
696  $hexbyte = $regA[1];
697  $utf8 = 'U+' . $regA[2];
698  }
699  $decval = hexdec(trim($hexbyte));
700  if ($decval > 127) {
701  $utf8decval = hexdec(substr(trim($utf8), 2));
702  $this->parsedCharsets[$charset]['local'][$decval] = $this->‪UnumberToChar($utf8decval);
703  $this->parsedCharsets[$charset]['utf8'][$this->parsedCharsets[$charset]['local'][$decval]] = $decval;
704  }
705  }
706  }
707  if ($cacheFile) {
708  GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
709  }
710  }
711  return 2;
712  }
713  throw new UnknownCharsetException(sprintf('Unknown charset "%s"', $charset), 1508916031);
714  }
715  return 1;
716  }
717 
725  protected function ‪initUnicodeData()
726  {
727  // Cache file
728  $cacheFileASCII = ‪Environment::getVarPath() . '/charset/csascii_utf-8.tbl';
729  // Only process if the tables are not yet loaded
730  if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
731  return 1;
732  }
733  // Use cached version if possible
734  if ($cacheFileASCII && @is_file($cacheFileASCII)) {
735  $this->toASCII['utf-8'] = unserialize(file_get_contents($cacheFileASCII), ['allowed_classes' => false]);
736  return 2;
737  }
738  // Process main Unicode data file
739  $unicodeDataFile = ‪ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/UnicodeData.txt';
740  if (!(GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
741  return false;
742  }
743  $fh = fopen($unicodeDataFile, 'rb');
744  if (!$fh) {
745  return false;
746  }
747  // Array of temp. decompositions
748  $decomposition = [];
749  // Array of chars that are marks (eg. composing accents)
750  $mark = [];
751  // Array of chars that are numbers (eg. digits)
752  $number = [];
753  // Array of chars to be omitted (eg. Russian hard sign)
754  $omit = [];
755  while (!feof($fh)) {
756  $line = fgets($fh, 4096);
757  // Has a lot of info
758  list($char, $name, $cat, , , $decomp, , , $num) = explode(';', rtrim($line));
759  $ord = hexdec($char);
760  if ($ord > 65535) {
761  // Only process the BMP
762  break;
763  }
764  switch ($cat[0]) {
765  case 'M':
766  // mark (accent, umlaut, ...)
767  $mark['U+' . $char] = 1;
768  break;
769  case 'N':
770  // numeric value
771  if ($ord > 128 && $num !== '') {
772  $number['U+' . $char] = $num;
773  }
774  }
775  // Accented Latin letters without "official" decomposition
776  $match = [];
777  if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
778  $c = ord($match[2]);
779  if ($match[1] === 'SMALL') {
780  $c += 32;
781  }
782  $decomposition['U+' . $char] = [dechex($c)];
783  continue;
784  }
785  $match = [];
786  if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
787  switch ($match[1]) {
788  case '<circle>':
789  // add parenthesis as circle replacement, eg (1)
790  $match[2] = '0028 ' . $match[2] . ' 0029';
791  break;
792  case '<square>':
793  // add square brackets as square replacement, eg [1]
794  $match[2] = '005B ' . $match[2] . ' 005D';
795  break;
796  case '<compat>':
797  // ignore multi char decompositions that start with a space
798  if (preg_match('/^0020 /', $match[2])) {
799  continue 2;
800  }
801  break;
802  case '<initial>':
803  case '<medial>':
804  case '<final>':
805  case '<isolated>':
806  case '<vertical>':
807  continue 2;
808  }
809  $decomposition['U+' . $char] = explode(' ', $match[2]);
810  }
811  }
812  fclose($fh);
813  // Process custom decompositions
814  $customTranslitFile = ‪ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/Translit.txt';
815  if (GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
816  $fh = fopen($customTranslitFile, 'rb');
817  if ($fh) {
818  while (!feof($fh)) {
819  $line = fgets($fh, 4096);
820  if ($line === false) {
821  continue;
822  }
823  if ($line[0] !== '#' && trim($line) !== '') {
824  list($char, $translit) = GeneralUtility::trimExplode(';', $line);
825  if (!$translit) {
826  $omit['U+' . $char] = 1;
827  }
828  $decomposition['U+' . $char] = explode(' ', $translit);
829  }
830  }
831  fclose($fh);
832  }
833  }
834  // Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
835  foreach ($decomposition as $from => $to) {
836  $code_decomp = [];
837  while ($code_value = array_shift($to)) {
838  // Do recursive decomposition
839  if (isset($decomposition['U+' . $code_value])) {
840  foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
841  array_unshift($to, $cv);
842  }
843  } elseif (!isset($mark['U+' . $code_value])) {
844  // remove mark
845  $code_decomp[] = $code_value;
846  }
847  }
848  if (!empty($code_decomp) || isset($omit[$from])) {
849  $decomposition[$from] = $code_decomp;
850  } else {
851  unset($decomposition[$from]);
852  }
853  }
854  // Create ascii only mapping
855  $this->toASCII['utf-8'] = [];
856  foreach ($decomposition as $from => $to) {
857  $code_decomp = [];
858  while ($code_value = array_shift($to)) {
859  $ord = hexdec($code_value);
860  if ($ord > 127) {
861  continue 2;
862  }
863  // Skip decompositions containing non-ASCII chars
864  $code_decomp[] = chr($ord);
865  }
866  $this->toASCII['utf-8'][$this->‪UnumberToChar(hexdec(substr($from, 2)))] = implode('', $code_decomp);
867  }
868  // Add numeric decompositions
869  foreach ($number as $from => $to) {
870  $utf8_char = $this->‪UnumberToChar(hexdec(substr($from, 2)));
871  if (!isset($this->toASCII['utf-8'][$utf8_char])) {
872  $this->toASCII['utf-8'][$utf8_char] = $to;
873  }
874  }
875  if ($cacheFileASCII) {
876  GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($this->toASCII['utf-8']));
877  }
878  return 3;
879  }
880 
888  protected function ‪initToASCII($charset)
889  {
890  // Only process if the case table is not yet loaded:
891  if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
892  return 1;
893  }
894  // Use cached version if possible
895  $cacheFile = ‪Environment::getVarPath() . '/charset/csascii_' . $charset . '.tbl';
896  if ($cacheFile && @is_file($cacheFile)) {
897  $this->toASCII[$charset] = unserialize(file_get_contents($cacheFile), ['allowed_classes' => false]);
898  return 2;
899  }
900  // Init UTF-8 conversion for this charset
901  if (!$this->‪initCharset($charset)) {
902  return false;
903  }
904  // UTF-8/ASCII transliteration is used as the base conversion table
905  if (!$this->‪initUnicodeData()) {
906  return false;
907  }
908  foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8) {
909  // Reconvert to charset (don't use chr() of numeric value, might be muli-byte)
910  $c = $this->‪utf8_decode($utf8, $charset);
911  if (isset($this->toASCII['utf-8'][$utf8])) {
912  $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
913  }
914  }
915  if ($cacheFile) {
916  GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
917  }
918  return 3;
919  }
920 
921  /********************************************
922  *
923  * String operation functions
924  *
925  ********************************************/
926 
939  public function ‪crop($charset, $string, $len, $crop = '')
940  {
941  trigger_error('Method CharsetConverter->crop() will be removed in TYPO3 v10.0. Use native PHP mbstring functions instead.', E_USER_DEPRECATED);
942  if ((int)$len === 0 || mb_strlen($string, $charset) <= abs($len)) {
943  return $string;
944  }
945  if ($len > 0) {
946  $string = mb_substr($string, 0, $len, $charset) . $crop;
947  } else {
948  $string = $crop . mb_substr($string, $len, mb_strlen($string, $charset), $charset);
949  }
950  return $string;
951  }
952 
962  public function ‪convCaseFirst($charset, $string, $case)
963  {
964  trigger_error('Method CharsetConverter->convCaseFirst() will be removed in TYPO3 v10.0. Use native PHP mbstring functions instead.', E_USER_DEPRECATED);
965  $firstChar = mb_substr($string, 0, 1, $charset);
966  $firstChar = $case === 'toLower'
967  ? mb_strtolower($firstChar, $charset)
968  : mb_strtoupper($firstChar, $charset);
969  $remainder = mb_substr($string, 1, null, $charset);
970  return $firstChar . $remainder;
971  }
972 
980  public function ‪specCharsToASCII($charset, $string)
981  {
982  if ($charset === 'utf-8') {
983  $string = $this->‪utf8_char_mapping($string);
984  } elseif (isset($this->eucBasedSets[$charset])) {
985  $string = $this->‪euc_char_mapping($string, $charset);
986  } else {
987  // Treat everything else as single-byte encoding
988  $string = $this->‪sb_char_mapping($string, $charset);
989  }
990  return $string;
991  }
992 
993  /********************************************
994  *
995  * Internal string operation functions
996  *
997  ********************************************/
1005  public function ‪sb_char_mapping($str, $charset)
1006  {
1007  if (!$this->‪initToASCII($charset)) {
1008  return $str;
1009  }
1010  // Do nothing
1011  $map = &$this->toASCII[$charset];
1012  $out = '';
1013  for ($i = 0; isset($str[$i]); $i++) {
1014  $c = $str[$i];
1015  if (isset($map[$c])) {
1016  $out .= $map[$c];
1017  } else {
1018  $out .= $c;
1019  }
1020  }
1021  return $out;
1022  }
1023 
1024  /********************************************
1025  *
1026  * Internal UTF-8 string operation functions
1027  *
1028  ********************************************/
1029 
1039  public function ‪utf8_char2byte_pos($str, $pos)
1040  {
1041  trigger_error('Method CharsetConverter->utf8_char2byte_pos() will be removed in TYPO3 v10.0.', E_USER_DEPRECATED);
1042  // Number of characters found
1043  $n = 0;
1044  // Number of characters wanted
1045  $p = abs($pos);
1046  if ($pos >= 0) {
1047  $i = 0;
1048  $d = 1;
1049  } else {
1050  $i = strlen($str) - 1;
1051  $d = -1;
1052  }
1053  for (; isset($str[$i]) && $n < $p; $i += $d) {
1054  $c = (int)ord($str[$i]);
1055  // single-byte (0xxxxxx)
1056  if (!($c & 128)) {
1057  $n++;
1058  } elseif (($c & 192) === 192) {
1059  // Multi-byte starting byte (11xxxxxx)
1060  $n++;
1061  }
1062  }
1063  if (!isset($str[$i])) {
1064  // Offset beyond string length
1065  return false;
1066  }
1067  if ($pos >= 0) {
1068  // Skip trailing multi-byte data bytes
1069  while (ord($str[$i]) & 128 && !(ord($str[$i]) & 64)) {
1070  $i++;
1071  }
1072  } else {
1073  // Correct offset
1074  $i++;
1075  }
1076  return $i;
1077  }
1085  public function ‪utf8_char_mapping($str)
1086  {
1087  if (!$this->‪initUnicodeData()) {
1088  // Do nothing
1089  return $str;
1090  }
1091  $out = '';
1092  $map = &$this->toASCII['utf-8'];
1093  for ($i = 0; isset($str[$i]); $i++) {
1094  $c = ord($str[$i]);
1095  $mbc = '';
1096  // single-byte (0xxxxxx)
1097  if (!($c & 128)) {
1098  $mbc = $str[$i];
1099  } elseif (($c & 192) === 192) {
1100  $bc = 0;
1101  // multi-byte starting byte (11xxxxxx)
1102  for (; $c & 128; $c = $c << 1) {
1103  $bc++;
1104  }
1105  // calculate number of bytes
1106  $mbc = substr($str, $i, $bc);
1107  $i += $bc - 1;
1108  }
1109  if (isset($map[$mbc])) {
1110  $out .= $map[$mbc];
1111  } else {
1112  $out .= $mbc;
1113  }
1114  }
1115  return $out;
1116  }
1117 
1118  /********************************************
1119  *
1120  * Internal EUC string operation functions
1121  *
1122  * Extended Unix Code:
1123  * ASCII compatible 7bit single bytes chars
1124  * 8bit two byte chars
1125  *
1126  * Shift-JIS is treated as a special case.
1127  *
1128  ********************************************/
1129 
1137  public function ‪euc_char_mapping($str, $charset)
1138  {
1139  if (!$this->‪initToASCII($charset)) {
1140  return $str;
1141  }
1142  // do nothing
1143  $map = &$this->toASCII[$charset];
1144  $out = '';
1145  for ($i = 0; isset($str[$i]); $i++) {
1146  $mbc = $str[$i];
1147  $c = ord($mbc);
1148  if ($charset === 'shift_jis') {
1149  // A double-byte char
1150  if ($c >= 128 && $c < 160 || $c >= 224) {
1151  $mbc = substr($str, $i, 2);
1152  $i++;
1153  }
1154  } else {
1155  // A double-byte char
1156  if ($c >= 128) {
1157  $mbc = substr($str, $i, 2);
1158  $i++;
1159  }
1160  }
1161  if (isset($map[$mbc])) {
1162  $out .= $map[$mbc];
1163  } else {
1164  $out .= $mbc;
1165  }
1166  }
1167  return $out;
1168  }
1169 }
‪TYPO3\CMS\Core\Charset\CharsetConverter\euc_char_mapping
‪string euc_char_mapping($str, $charset)
Definition: CharsetConverter.php:1130
‪TYPO3\CMS\Core\Charset\CharsetConverter\utf8_encode
‪string utf8_encode($str, $charset)
Definition: CharsetConverter.php:288
‪TYPO3\CMS\Core\Charset\CharsetConverter\$noCharByteVal
‪int $noCharByteVal
Definition: CharsetConverter.php:73
‪TYPO3\CMS\Core\Charset\CharsetConverter\UnumberToChar
‪string UnumberToChar($unicodeInteger)
Definition: CharsetConverter.php:566
‪TYPO3\CMS\Core\Charset\CharsetConverter\$parsedCharsets
‪array $parsedCharsets
Definition: CharsetConverter.php:79
‪TYPO3\CMS\Core\Charset\CharsetConverter\utf8_to_entities
‪string utf8_to_entities($str)
Definition: CharsetConverter.php:417
‪TYPO3\CMS\Core\Charset\CharsetConverter\$twoByteSets
‪array $twoByteSets
Definition: CharsetConverter.php:91
‪TYPO3\CMS\Core\Charset\CharsetConverter\utf8CharToUnumber
‪int utf8CharToUnumber($str, $hex=false)
Definition: CharsetConverter.php:612
‪TYPO3\CMS\Core\Charset\CharsetConverter\utf8_to_numberarray
‪array utf8_to_numberarray($str)
Definition: CharsetConverter.php:504
‪TYPO3\CMS\Core\Charset\CharsetConverter
Definition: CharsetConverter.php:54
‪TYPO3\CMS\Core\Charset\CharsetConverter\initCharset
‪int initCharset($charset)
Definition: CharsetConverter.php:654
‪TYPO3\CMS\Core\Charset\CharsetConverter\parse_charset
‪string parse_charset($charset)
Definition: CharsetConverter.php:204
‪TYPO3\CMS\Core\Utility\ExtensionManagementUtility
Definition: ExtensionManagementUtility.php:36
‪TYPO3\CMS\Core\Charset
Definition: CharsetConverter.php:2
‪TYPO3\CMS\Core\Charset\CharsetConverter\initUnicodeData
‪int initUnicodeData()
Definition: CharsetConverter.php:718
‪TYPO3\CMS\Core\Charset\CharsetConverter\utf8_decode
‪string utf8_decode($str, $charset, $useEntityForNoChar=false)
Definition: CharsetConverter.php:349
‪TYPO3\CMS\Core\Charset\CharsetConverter\$eucBasedSets
‪array $eucBasedSets
Definition: CharsetConverter.php:99
‪TYPO3\CMS\Core\Charset\CharsetConverter\convCaseFirst
‪string convCaseFirst($charset, $string, $case)
Definition: CharsetConverter.php:955
‪TYPO3\CMS\Core\Charset\UnknownCharsetException
Definition: UnknownCharsetException.php:23
‪TYPO3\CMS\Core\Charset\CharsetConverter\specCharsToASCII
‪string specCharsToASCII($charset, $string)
Definition: CharsetConverter.php:973
‪TYPO3\CMS\Core\Charset\CharsetConverter\entities_to_utf8
‪string entities_to_utf8($str)
Definition: CharsetConverter.php:463
‪TYPO3\CMS\Core\Charset\CharsetConverter\crop
‪string crop($charset, $string, $len, $crop='')
Definition: CharsetConverter.php:932
‪TYPO3\CMS\Core\Charset\CharsetConverter\$toASCII
‪array $toASCII
Definition: CharsetConverter.php:85
‪TYPO3\CMS\Core\Charset\CharsetConverter\convArray
‪convArray(&$array, $fromCharset, $toCharset, $useEntityForNoChar=false)
Definition: CharsetConverter.php:269
‪TYPO3\CMS\Core\SingletonInterface
Definition: SingletonInterface.php:22
‪TYPO3\CMS\Core\Charset\CharsetConverter\utf8_char_mapping
‪string utf8_char_mapping($str)
Definition: CharsetConverter.php:1078
‪TYPO3\CMS\Core\Core\Environment
Definition: Environment.php:39
‪TYPO3\CMS\Core\Charset\CharsetConverter\initToASCII
‪int initToASCII($charset)
Definition: CharsetConverter.php:881
‪TYPO3\CMS\Core\Charset\CharsetConverter\$synonyms
‪array $synonyms
Definition: CharsetConverter.php:112
‪TYPO3\CMS\Core\Charset\CharsetConverter\conv
‪string conv($inputString, $fromCharset, $toCharset, $useEntityForNoChar=null)
Definition: CharsetConverter.php:229
‪TYPO3\CMS\Core\Charset\CharsetConverter\$deprecatedPublicProperties
‪array $deprecatedPublicProperties
Definition: CharsetConverter.php:60
‪TYPO3\CMS\Core\Utility\ExtensionManagementUtility\extPath
‪static string extPath($key, $script='')
Definition: ExtensionManagementUtility.php:149
‪TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait
Definition: PublicPropertyDeprecationTrait.php:66
‪TYPO3\CMS\Core\Utility\GeneralUtility
Definition: GeneralUtility.php:45
‪TYPO3\CMS\Core\Charset\CharsetConverter\utf8_char2byte_pos
‪int utf8_char2byte_pos($str, $pos)
Definition: CharsetConverter.php:1032
‪TYPO3\CMS\Core\Charset\CharsetConverter\sb_char_mapping
‪string sb_char_mapping($str, $charset)
Definition: CharsetConverter.php:998
‪TYPO3\CMS\Core\Core\Environment\getVarPath
‪static string getVarPath()
Definition: Environment.php:165