TYPO3 CMS  TYPO3_8-7
CharsetConverter.php
Go to the documentation of this file.
1 <?php
3 
4 /*
5  * This file is part of the TYPO3 CMS project.
6  *
7  * It is free software; you can redistribute it and/or modify it under
8  * the terms of the GNU General Public License, either version 2
9  * of the License, or any later version.
10  *
11  * For the full copyright and license information, please read the
12  * LICENSE.txt file that was distributed with this source code.
13  *
14  * The TYPO3 project - inspiring people to share!
15  */
16 
22 
56 {
/**
 * Byte value written in place of a character that cannot be converted.
 * 63 is the ASCII code of "?".
 *
 * @var int
 */
62  public $noCharByteVal = 63;
63 
/**
 * Conversion tables per charset, filled by initCharset().
 * Each entry holds two maps: 'local' (local character number => UTF-8 byte sequence)
 * and 'utf8' (UTF-8 byte sequence => local character number).
 *
 * @var array
 */
69  public $parsedCharsets = [];
70 
/**
 * Case folding tables per charset ('utf-8' is filled by initUnicodeData(), others by initCaseFolding()).
 * Each entry holds the maps 'toUpper', 'toLower' and 'toTitle' (character => folded character(s)).
 *
 * @var array
 */
76  public $caseFolding = [];
77 
/**
 * ASCII transliteration tables per charset ('utf-8' is filled by initUnicodeData(), others by initToASCII()).
 * Each entry maps a character to its ASCII-only replacement string.
 *
 * @var array
 */
83  public $toASCII = [];
84 
/**
 * Charsets (as array keys) in which every character occupies two bytes.
 * utf8_encode() reads input in these charsets as big-endian 16-bit units.
 *
 * @var array
 */
90  public $twoByteSets = [
91  'ucs-2' => 1
92  ];
93 
/**
 * Charsets (as array keys) in which every character occupies four bytes.
 * NOTE(review): this list is not referenced in the visible part of the file - confirm where it is used.
 *
 * @var array
 */
100  public $fourByteSets = [
101  'ucs-4' => 1, // 4-byte Unicode
102  'utf-32' => 1
103  ];
104 
/**
 * Charsets (as array keys) handled as EUC-style multi-byte encodings:
 * utf8_encode() combines a byte above 127 with the following byte into one 16-bit character number
 * (for shift_jis, bytes 160-223 stay single-byte), and specCharsToASCII() routes these
 * charsets to euc_char_mapping().
 *
 * @var array
 */
110  public $eucBasedSets = [
111  'gb2312' => 1, // Chinese, simplified.
112  'big5' => 1, // Chinese, traditional.
113  'euc-kr' => 1, // Korean
114  'shift_jis' => 1
115  ];
116 
/**
 * Alias => canonical charset name, consulted by parse_charset().
 * Keys must be lower-case because parse_charset() lower-cases the name before the lookup.
 *
 * @var array
 */
123  public $synonyms = [
124  'us' => 'ascii',
125  'us-ascii' => 'ascii',
126  'cp819' => 'iso-8859-1',
127  'ibm819' => 'iso-8859-1',
128  'iso-ir-100' => 'iso-8859-1',
129  'iso-ir-101' => 'iso-8859-2',
130  'iso-ir-109' => 'iso-8859-3',
131  'iso-ir-110' => 'iso-8859-4',
132  'iso-ir-144' => 'iso-8859-5',
133  'iso-ir-127' => 'iso-8859-6',
134  'iso-ir-126' => 'iso-8859-7',
135  'iso-ir-138' => 'iso-8859-8',
136  'iso-ir-148' => 'iso-8859-9',
137  'iso-ir-157' => 'iso-8859-10',
138  'iso-ir-179' => 'iso-8859-13',
139  'iso-ir-199' => 'iso-8859-14',
140  'iso-ir-203' => 'iso-8859-15',
141  'csisolatin1' => 'iso-8859-1',
142  'csisolatin2' => 'iso-8859-2',
143  'csisolatin3' => 'iso-8859-3',
144  'csisolatin5' => 'iso-8859-9',
145  'csisolatin8' => 'iso-8859-14',
146  'csisolatin9' => 'iso-8859-15',
147  'csisolatingreek' => 'iso-8859-7',
148  'iso-celtic' => 'iso-8859-14',
149  'latin1' => 'iso-8859-1',
150  'latin2' => 'iso-8859-2',
151  'latin3' => 'iso-8859-3',
152  'latin5' => 'iso-8859-9',
153  'latin6' => 'iso-8859-10',
154  'latin8' => 'iso-8859-14',
155  'latin9' => 'iso-8859-15',
156  'l1' => 'iso-8859-1',
157  'l2' => 'iso-8859-2',
158  'l3' => 'iso-8859-3',
159  'l5' => 'iso-8859-9',
160  'l6' => 'iso-8859-10',
161  'l8' => 'iso-8859-14',
162  'l9' => 'iso-8859-15',
163  'cyrillic' => 'iso-8859-5',
164  'arabic' => 'iso-8859-6',
165  'tis-620' => 'iso-8859-11',
166  'win874' => 'windows-874',
167  'win1250' => 'windows-1250',
168  'win1251' => 'windows-1251',
169  'win1252' => 'windows-1252',
170  'win1253' => 'windows-1253',
171  'win1254' => 'windows-1254',
172  'win1255' => 'windows-1255',
173  'win1256' => 'windows-1256',
174  'win1257' => 'windows-1257',
175  'win1258' => 'windows-1258',
176  'cp1250' => 'windows-1250',
177  'cp1251' => 'windows-1251',
178  'cp1252' => 'windows-1252',
179  'ms-ee' => 'windows-1250',
180  'ms-ansi' => 'windows-1252',
181  'ms-greek' => 'windows-1253',
182  'ms-turk' => 'windows-1254',
183  'winbaltrim' => 'windows-1257',
184  'koi-8ru' => 'koi-8r',
185  'koi8r' => 'koi-8r',
186  'cp878' => 'koi-8r',
187  'mac' => 'macroman',
188  'macintosh' => 'macroman',
189  'euc-cn' => 'gb2312',
190  'x-euc-cn' => 'gb2312',
191  'euccn' => 'gb2312',
192  'cp936' => 'gb2312',
193  'big-5' => 'big5',
194  'cp950' => 'big5',
195  'eucjp' => 'euc-jp',
196  'sjis' => 'shift_jis',
197  'shift-jis' => 'shift_jis',
198  'cp932' => 'shift_jis',
199  'cp949' => 'euc-kr',
200  'utf7' => 'utf-7',
201  'utf8' => 'utf-8',
202  'utf16' => 'utf-16',
203  'utf32' => 'utf-32',
204  'ucs2' => 'ucs-2',
205  'ucs4' => 'ucs-4'
206  ];
207 
/**
 * Language key => charset name.
 * NOTE(review): an empty string presumably means "use the default charset" - the visible part of
 * the file never reads this table, so confirm the meaning against its callers.
 *
 * @var array
 */
215  public $charSetArray = [
216  'af' => '',
217  'ar' => 'iso-8859-6',
218  'ba' => 'iso-8859-2',
219  'bg' => 'windows-1251',
220  'br' => '',
221  'ca' => 'iso-8859-15',
222  'ch' => 'gb2312',
223  'cs' => 'windows-1250',
224  'cz' => 'windows-1250',
225  'da' => '',
226  'de' => '',
227  'dk' => '',
228  'el' => 'iso-8859-7',
229  'eo' => 'utf-8',
230  'es' => '',
231  'et' => 'iso-8859-4',
232  'eu' => '',
233  'fa' => 'utf-8',
234  'fi' => '',
235  'fo' => 'utf-8',
236  'fr' => '',
237  'fr_CA' => '',
238  'ga' => '',
239  'ge' => 'utf-8',
240  'gl' => '',
241  'gr' => 'iso-8859-7',
242  'he' => 'utf-8',
243  'hi' => 'utf-8',
244  'hk' => 'big5',
245  'hr' => 'windows-1250',
246  'hu' => 'iso-8859-2',
247  'is' => 'utf-8',
248  'it' => '',
249  'ja' => 'shift_jis',
250  'jp' => 'shift_jis',
251  'ka' => 'utf-8',
252  'kl' => 'utf-8',
253  'km' => 'utf-8',
254  'ko' => 'euc-kr',
255  'kr' => 'euc-kr',
256  'lt' => 'windows-1257',
257  'lv' => 'utf-8',
258  'ms' => '',
259  'my' => '',
260  'nl' => '',
261  'no' => '',
262  'pl' => 'iso-8859-2',
263  'pt' => '',
264  'pt_BR' => '',
265  'qc' => '',
266  'ro' => 'iso-8859-2',
267  'ru' => 'windows-1251',
268  'se' => '',
269  'si' => 'windows-1250',
270  'sk' => 'windows-1250',
271  'sl' => 'windows-1250',
272  'sq' => 'utf-8',
273  'sr' => 'utf-8',
274  'sv' => '',
275  'th' => 'iso-8859-11',
276  'tr' => 'iso-8859-9',
277  'ua' => 'windows-1251',
278  'uk' => 'windows-1251',
279  'vi' => 'utf-8',
280  'vn' => 'utf-8',
281  'zh' => 'big5'
282  ];
283 
/**
 * Normalizes a charset name: lower-cases it, strips surrounding whitespace
 * and replaces a known alias from $this->synonyms by its canonical name.
 *
 * @param string $charset Charset name as supplied by the caller
 * @return string Normalized charset name
 */
public function parse_charset($charset)
{
    $normalized = trim(strtolower($charset));
    return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
298 
299  /********************************************
300  *
301  * Charset Conversion functions
302  *
303  ********************************************/
/**
 * Converts a string from one charset to another.
 *
 * mbstring is tried first unless entity fallback is requested for a non-UTF-8 target
 * (mbstring cannot emit numeric entities for unmappable characters). If mbstring is
 * skipped or reports failure, the table-driven conversion is used with UTF-8 as pivot.
 *
 * @param string $inputString Input string
 * @param string $fromCharset Source charset (expected to be normalized already)
 * @param string $toCharset Target charset (expected to be normalized already)
 * @param bool $useEntityForNoChar If set, unmappable characters become numeric entities instead of the "no char" byte
 * @return string Converted string
 */
public function conv($inputString, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    if ($fromCharset === $toCharset) {
        return $inputString;
    }
    $mbstringAllowed = $toCharset === 'utf-8' || !$useEntityForNoChar;
    if ($mbstringAllowed) {
        // mb_convert_encoding() yields FALSE for charsets it does not support
        $viaMbstring = mb_convert_encoding($inputString, $toCharset, $fromCharset);
        if ($viaMbstring !== false) {
            return $viaMbstring;
        }
    }
    // Table-driven fallback: source charset -> UTF-8 -> target charset
    $pivot = $fromCharset === 'utf-8'
        ? $inputString
        : $this->utf8_encode($inputString, $fromCharset);
    return $toCharset === 'utf-8'
        ? $pivot
        : $this->utf8_decode($pivot, $toCharset, $useEntityForNoChar);
}
335 
/**
 * Converts all string values of an array (recursively) from one charset to another.
 * The array is modified in place; non-string, non-array values are left untouched.
 *
 * @param array $array Array to convert, passed by reference
 * @param string $fromCharset Source charset
 * @param string $toCharset Target charset
 * @param bool $useEntityForNoChar If set, unmappable characters become numeric entities
 */
public function convArray(&$array, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    foreach ($array as &$item) {
        if (is_array($item)) {
            $this->convArray($item, $fromCharset, $toCharset, $useEntityForNoChar);
        } elseif (is_string($item)) {
            $item = $this->conv($item, $fromCharset, $toCharset, $useEntityForNoChar);
        }
    }
    // Break the reference so later writes to $item cannot touch the last element
    unset($item);
}
356 
/**
 * Converts a string from the given charset to UTF-8 using the parsed conversion table.
 *
 * Bytes up to 127 are copied unchanged, except in two-byte charsets where every pair of
 * bytes forms one big-endian character number. A character number missing from the table
 * is replaced by the "no char" byte ($this->noCharByteVal).
 * A truncated trailing character (odd-length two-byte input or an EUC lead byte at the end
 * of the string) is completed with a zero low byte instead of reading past the end of the string.
 *
 * @param string $str String in the local charset
 * @param string $charset Charset of $str (expected to be normalized already)
 * @return string UTF-8 string; empty string if no conversion table could be loaded
 */
public function utf8_encode($str, $charset)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse the conversion table if that has not happened yet
    if (!$this->initCharset($charset)) {
        return '';
    }
    $str = (string)$str;
    $strLen = strlen($str);
    $noChar = chr($this->noCharByteVal);
    $isTwoByteSet = isset($this->twoByteSets[$charset]);
    $isEucBasedSet = isset($this->eucBasedSets[$charset]);
    $outStr = '';
    for ($a = 0; $a < $strLen; $a++) {
        $chr = $str[$a];
        $ord = ord($chr);
        if ($isTwoByteSet) {
            // Two bytes per character, big endian assumed
            $a++;
            $ord2 = $a < $strLen ? ord($str[$a]) : 0;
            $ord = ($ord << 8) | $ord2;
        } elseif ($ord > 127) {
            // EUC uses two bytes above 127; Shift-JIS bytes 160-223 are single-byte characters
            if ($isEucBasedSet && ($charset !== 'shift_jis' || $ord < 160 || $ord > 223)) {
                $a++;
                $ord2 = $a < $strLen ? ord($str[$a]) : 0;
                $ord = $ord * 256 + $ord2;
            }
        } else {
            // Plain 7-bit byte: identical in UTF-8
            $outStr .= $chr;
            continue;
        }
        // Use the table entry for the local character number, otherwise the "no char" byte
        $outStr .= isset($this->parsedCharsets[$charset]['local'][$ord])
            ? $this->parsedCharsets[$charset]['local'][$ord]
            : $noChar;
    }
    return $outStr;
}
416 
/**
 * Converts a UTF-8 string to the given charset using the parsed conversion table.
 *
 * A UTF-8 sequence without a table entry becomes either a hexadecimal numeric entity
 * (when $useEntityForNoChar is set) or the "no char" byte. A stray continuation byte
 * (high bit set, second-highest bit clear) also becomes the "no char" byte.
 *
 * @param string $str UTF-8 string
 * @param string $charset Target charset (expected to be normalized already)
 * @param bool $useEntityForNoChar If set, unmappable characters become numeric entities
 * @return string String in the target charset; empty string if no conversion table could be loaded
 */
public function utf8_decode($str, $charset, $useEntityForNoChar = false)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse the conversion table if that has not happened yet
    if (!$this->initCharset($charset)) {
        return '';
    }
    $length = strlen($str);
    $result = '';
    $pos = 0;
    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // 7-bit byte: copy as is
            $result .= $byte;
        } elseif (!($code & 64)) {
            // Continuation byte without a lead byte
            $result .= chr($this->noCharByteVal);
        } else {
            // Lead byte: each further leading 1-bit announces one more byte of the sequence
            $sequence = $byte;
            for ($n = 0; $n < 8; $n++) {
                $code = $code << 1;
                if (!($code & 128)) {
                    break;
                }
                $pos++;
                $sequence .= substr($str, $pos, 1);
            }
            if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
                $localNumber = $this->parsedCharsets[$charset]['utf8'][$sequence];
                if ($localNumber > 255) {
                    // Local numbers above 255 are written as two bytes (16-bit word assumed)
                    $result .= chr(($localNumber >> 8 & 255)) . chr(($localNumber & 255));
                } else {
                    $result .= chr($localNumber);
                }
            } elseif ($useEntityForNoChar) {
                $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
            } else {
                $result .= chr($this->noCharByteVal);
            }
        }
        $pos++;
    }
    return $result;
}
485 
/**
 * Replaces every multi-byte UTF-8 character by a hexadecimal numeric entity (&#x...;).
 * 7-bit bytes are copied unchanged; a stray continuation byte becomes the "no char" byte.
 *
 * @param string $str UTF-8 string
 * @return string String with multi-byte characters turned into numeric entities
 */
public function utf8_to_entities($str)
{
    $length = strlen($str);
    $result = '';
    $pos = 0;
    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            $result .= $byte;
        } elseif (!($code & 64)) {
            // Continuation byte without a lead byte
            $result .= chr($this->noCharByteVal);
        } else {
            // Lead byte: each further leading 1-bit announces one more byte of the sequence
            $sequence = $byte;
            for ($n = 0; $n < 8; $n++) {
                $code = $code << 1;
                if (!($code & 128)) {
                    break;
                }
                $pos++;
                $sequence .= substr($str, $pos, 1);
            }
            $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
        }
        $pos++;
    }
    return $result;
}
529 
/**
 * Converts numeric entities (decimal and hexadecimal) and named HTML entities to UTF-8 characters.
 * Entities that are neither numeric nor in PHP's HTML entity table are left untouched.
 *
 * @param string $str String possibly containing entities
 * @return string String with the entities replaced
 */
public function entities_to_utf8($str)
{
    $entityToChar = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT));
    // Surround every entity body with a unique marker so that explode() puts
    // the entity bodies on the odd positions of the resulting list
    $marker = md5(microtime());
    $marked = preg_replace('/(&([#[:alnum:]]*);)/', $marker . '${2}' . $marker, $str);
    $parts = explode($marker, $marked);
    $partCount = count($parts);
    for ($k = 1; $k < $partCount; $k += 2) {
        $entity = $parts[$k];
        if (substr($entity, 0, 1) === '#') {
            // Numeric entity: "#x..." is hexadecimal, otherwise decimal
            if (substr($entity, 1, 1) === 'x') {
                $number = hexdec(substr($entity, 2));
            } else {
                $number = substr($entity, 1);
            }
            $parts[$k] = $this->UnumberToChar($number);
        } elseif (isset($entityToChar['&' . $entity . ';'])) {
            // Named entity known to PHP
            $parts[$k] = $entityToChar['&' . $entity . ';'];
        } else {
            // Unknown entity: restore the original text
            $parts[$k] = '&' . $entity . ';';
        }
    }
    return implode('', $parts);
}
567 
/**
 * Splits a UTF-8 string into a list with one entry per character.
 * Entities are converted to UTF-8 characters first. Each list entry is the character's
 * byte sequence; a stray continuation byte is represented by the "no char" byte.
 *
 * @param string $str UTF-8 string, possibly containing entities
 * @return array List of characters
 */
public function utf8_to_numberarray($str)
{
    // Entities must be registered as well
    $str = $this->entities_to_utf8($str);

    $length = strlen($str);
    $characters = [];
    $pos = 0;
    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            $characters[] = chr($code);
        } elseif (!($code & 64)) {
            // Continuation byte without a lead byte
            $characters[] = chr($this->noCharByteVal);
        } else {
            // Lead byte: each further leading 1-bit announces one more byte of the sequence
            $sequence = $byte;
            for ($n = 0; $n < 8; $n++) {
                $code = $code << 1;
                if (!($code & 128)) {
                    break;
                }
                $pos++;
                $sequence .= substr($str, $pos, 1);
            }
            $characters[] = $sequence;
        }
        $pos++;
    }
    return $characters;
}
617 
/**
 * Encodes a Unicode number as a UTF-8 byte sequence of one to six bytes.
 * Numbers of 2147483648 and above cannot be expressed and yield the "no char" byte.
 *
 * @param int $unicodeInteger Unicode number
 * @return string UTF-8 byte sequence
 */
public function UnumberToChar($unicodeInteger)
{
    if ($unicodeInteger < 128) {
        $byteValues = [$unicodeInteger];
    } elseif ($unicodeInteger < 2048) {
        $byteValues = [
            192 | ($unicodeInteger >> 6),
            128 | ($unicodeInteger & 63),
        ];
    } elseif ($unicodeInteger < 65536) {
        $byteValues = [
            224 | ($unicodeInteger >> 12),
            128 | (($unicodeInteger >> 6) & 63),
            128 | ($unicodeInteger & 63),
        ];
    } elseif ($unicodeInteger < 2097152) {
        $byteValues = [
            240 | ($unicodeInteger >> 18),
            128 | (($unicodeInteger >> 12) & 63),
            128 | (($unicodeInteger >> 6) & 63),
            128 | ($unicodeInteger & 63),
        ];
    } elseif ($unicodeInteger < 67108864) {
        $byteValues = [
            248 | ($unicodeInteger >> 24),
            128 | (($unicodeInteger >> 18) & 63),
            128 | (($unicodeInteger >> 12) & 63),
            128 | (($unicodeInteger >> 6) & 63),
            128 | ($unicodeInteger & 63),
        ];
    } elseif ($unicodeInteger < 2147483648) {
        $byteValues = [
            252 | ($unicodeInteger >> 30),
            128 | (($unicodeInteger >> 24) & 63),
            128 | (($unicodeInteger >> 18) & 63),
            128 | (($unicodeInteger >> 12) & 63),
            128 | (($unicodeInteger >> 6) & 63),
            128 | ($unicodeInteger & 63),
        ];
    } else {
        // Cannot express a 32-bit character in UTF-8
        $byteValues = [$this->noCharByteVal];
    }
    $str = '';
    foreach ($byteValues as $byteValue) {
        $str .= chr($byteValue);
    }
    return $str;
}
674 
/**
 * Decodes a single UTF-8 character into its Unicode number.
 * If the first byte is not a multi-byte lead byte, its byte value is returned.
 *
 * @param string $str UTF-8 byte sequence of one character
 * @param bool $hex If set, the number is returned as "x" followed by lower-case hex digits
 * @return int|string Unicode number, or the "x..." string when $hex is set
 */
public function utf8CharToUnumber($str, $hex = false)
{
    $leadByte = ord($str[0]);
    if (($leadByte & 192) !== 192) {
        $number = $leadByte;
    } else {
        $shifted = $leadByte;
        $payloadBits = '';
        $followers = 0;
        // Each further leading 1-bit of the lead byte announces one continuation byte
        while ($followers < 8) {
            $shifted = $shifted << 1;
            if (!($shifted & 128)) {
                break;
            }
            // Every continuation byte contributes its lower six bits
            $payloadBits .= substr('00000000' . decbin(ord(substr($str, ($followers + 1), 1))), -6);
            $followers++;
        }
        // The lead byte contributes the bits left after its length marker
        $leadBits = substr(('00000000' . decbin($leadByte)), -(6 - $followers));
        $number = bindec($leadBits . $payloadBits);
    }
    return $hex ? 'x' . dechex($number) : $number;
}
710 
711  /********************************************
712  *
713  * Init functions
714  *
715  ********************************************/
/**
 * Loads the conversion table of a charset into $this->parsedCharsets[$charset].
 *
 * The table comes from the cache file in typo3temp/var/charset/ if present; otherwise the
 * .tbl file shipped with the core is parsed and the result is written to the cache.
 * Only local character numbers above 127 are stored.
 *
 * @param string $charset Charset name (expected to be normalized already)
 * @return int|bool 1 if the table was already loaded, 2 if it was loaded from cache or freshly parsed, FALSE if no table file exists for the charset
 */
public function initCharset($charset)
{
    // Only process if the charset is not yet loaded; isset() avoids a notice on first use
    if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
        return 1;
    }
    // Conversion table filename:
    $charsetConvTableFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/csconvtbl/' . $charset . '.tbl';
    if (!($charset && GeneralUtility::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) {
        GeneralUtility::makeInstance(LogManager::class)
            ->getLogger(__CLASS__)
            ->warning('Unknown charset "' . $charset . '" used for settings like config.metaCharset.');
        return false;
    }
    // Cache file for charsets:
    // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/charset_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        // NOTE(review): the cache content is trusted as is; unserialize() offers no protection
        // if the cache directory is writable by untrusted parties.
        $this->parsedCharsets[$charset] = unserialize(file_get_contents($cacheFile));
        return 2;
    }
    // Parse conversion table into lines:
    $lines = GeneralUtility::trimExplode(LF, file_get_contents($charsetConvTableFile), true);
    // Initialize the internal variable holding the conv. table:
    $this->parsedCharsets[$charset] = ['local' => [], 'utf8' => []];
    $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
    $detectedType = '';
    foreach ($lines as $value) {
        // Comment lines and blanks are ignored
        if (!trim($value) || $value[0] === '#') {
            continue;
        }
        // Detect the table syntax on the first real line:
        // "whitespaced" looks like "0x0A 0x000A #LINE FEED", "ms-token" like "B9 = U+00B9 : SUPERSCRIPT ONE"
        if (!$detectedType) {
            $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
        }
        $hexbyte = '';
        $utf8 = '';
        if ($detectedType === 'ms-token') {
            // Pad so that a line without separator does not raise an undefined-offset notice
            list($hexbyte, $utf8) = array_pad(preg_split('/[=:]/', $value, 3), 2, '');
        } elseif ($detectedType === 'whitespaced') {
            $regA = [];
            if (!preg_match($whitespacedPattern, $value, $regA)) {
                // Line does not follow the detected syntax; it would not yield an entry anyway
                continue;
            }
            $hexbyte = $regA[1];
            $utf8 = 'U+' . $regA[2];
        }
        $decval = hexdec(trim($hexbyte));
        if ($decval > 127) {
            $utf8decval = hexdec(substr(trim($utf8), 2));
            $utf8Char = $this->UnumberToChar($utf8decval);
            $this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
            $this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
        }
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
    }
    return 2;
}
785 
/**
 * Builds the UTF-8 case folding table ($this->caseFolding['utf-8']) and the UTF-8 to ASCII
 * transliteration table ($this->toASCII['utf-8']) from UnicodeData.txt, SpecialCasing.txt and
 * Translit.txt, or loads the requested one from its cache file.
 *
 * The data files are read with fgets() as loop condition, so the FALSE returned at end of
 * file is never processed as a data line.
 *
 * @param string|null $mode 'case' or 'ascii' to allow the short cuts for that table; NULL always rebuilds both tables
 * @return int|bool 1 if the requested table was already loaded, 2 if it was loaded from cache, 3 if both tables were built from the data files, FALSE if UnicodeData.txt cannot be read
 */
public function initUnicodeData($mode = null)
{
    // Cache files
    $cacheFileCase = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/cscase_utf-8.tbl');
    $cacheFileASCII = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/csascii_utf-8.tbl');
    // Only process if the tables are not yet loaded
    switch ($mode) {
        case 'case':
            if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
                return 1;
            }
            // Use cached version if possible
            if ($cacheFileCase && @is_file($cacheFileCase)) {
                $this->caseFolding['utf-8'] = unserialize(file_get_contents($cacheFileCase));
                return 2;
            }
            break;
        case 'ascii':
            if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
                return 1;
            }
            // Use cached version if possible
            if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                $this->toASCII['utf-8'] = unserialize(file_get_contents($cacheFileASCII));
                return 2;
            }
            break;
    }
    // Process main Unicode data file
    $unicodeDataFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/UnicodeData.txt';
    if (!(GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
        return false;
    }
    $fh = fopen($unicodeDataFile, 'rb');
    if (!$fh) {
        return false;
    }
    // Converts a space separated list of hex code points into a UTF-8 string
    $toUtf8Sequence = function ($codePoints) {
        $chars = [];
        foreach (explode(' ', $codePoints) as $hexCodePoint) {
            $chars[] = $this->UnumberToChar(hexdec($hexCodePoint));
        }
        return implode('', $chars);
    };
    // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
    // Note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
    $this->caseFolding['utf-8'] = [];
    // a shorthand
    $utf8CaseFolding = &$this->caseFolding['utf-8'];
    $utf8CaseFolding['toUpper'] = [];
    $utf8CaseFolding['toLower'] = [];
    $utf8CaseFolding['toTitle'] = [];
    // Array of temp. decompositions
    $decomposition = [];
    // Array of chars that are marks (eg. composing accents)
    $mark = [];
    // Array of chars that are numbers (eg. digits)
    $number = [];
    // Array of chars to be omitted (eg. Russian hard sign)
    $omit = [];
    while (($line = fgets($fh, 4096)) !== false) {
        $line = rtrim($line);
        if ($line === '') {
            continue;
        }
        // UnicodeData.txt has 15 fields per line; pad so short lines do not raise notices.
        // Used fields: 0 code point, 1 name, 2 category, 5 decomposition, 8 numeric value,
        // 12 upper, 13 lower, 14 title case mapping
        $fields = array_pad(explode(';', $line), 15, '');
        $char = $fields[0];
        $name = $fields[1];
        $cat = $fields[2];
        $decomp = $fields[5];
        $num = $fields[8];
        $upper = $fields[12];
        $lower = $fields[13];
        $title = $fields[14];
        $ord = hexdec($char);
        if ($ord > 65535) {
            // Only process the BMP
            break;
        }
        $utf8_char = $this->UnumberToChar($ord);
        if ($upper) {
            $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
        }
        if ($lower) {
            $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
        }
        // Store "title" only when different from "upper" (only a few)
        if ($title && $title !== $upper) {
            $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
        }
        switch (isset($cat[0]) ? $cat[0] : '') {
            case 'M':
                // mark (accent, umlaut, ...)
                $mark['U+' . $char] = 1;
                break;
            case 'N':
                // numeric value
                if ($ord > 128 && $num !== '') {
                    $number['U+' . $char] = $num;
                }
        }
        // Accented Latin letters without "official" decomposition
        $match = [];
        if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
            $c = ord($match[2]);
            if ($match[1] === 'SMALL') {
                $c += 32;
            }
            $decomposition['U+' . $char] = [dechex($c)];
            continue;
        }
        $match = [];
        if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
            switch ($match[1]) {
                case '<circle>':
                    // add parenthesis as circle replacement, eg (1)
                    $match[2] = '0028 ' . $match[2] . ' 0029';
                    break;
                case '<square>':
                    // add square brackets as square replacement, eg [1]
                    $match[2] = '005B ' . $match[2] . ' 005D';
                    break;
                case '<compat>':
                    // ignore multi char decompositions that start with a space
                    if (preg_match('/^0020 /', $match[2])) {
                        continue 2;
                    }
                    break;
                case '<initial>':
                case '<medial>':
                case '<final>':
                case '<isolated>':
                case '<vertical>':
                    continue 2;
            }
            $decomposition['U+' . $char] = explode(' ', $match[2]);
        }
    }
    fclose($fh);
    // Process additional Unicode data for casing (allow folded characters to expand into a sequence)
    $specialCasingFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/SpecialCasing.txt';
    if (GeneralUtility::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
        $fh = fopen($specialCasingFile, 'rb');
        if ($fh) {
            while (($line = fgets($fh, 4096)) !== false) {
                if ($line[0] === '#' || trim($line) === '') {
                    continue;
                }
                list($char, $lower, $title, $upper, $cond) = GeneralUtility::trimExplode(';', $line);
                // Only unconditional mappings are used (condition field empty or already the trailing comment)
                if ($cond === '' || $cond[0] === '#') {
                    $utf8_char = $this->UnumberToChar(hexdec($char));
                    if ($char !== $lower) {
                        $utf8CaseFolding['toLower'][$utf8_char] = $toUtf8Sequence($lower);
                    }
                    if ($char !== $title && $title !== $upper) {
                        $utf8CaseFolding['toTitle'][$utf8_char] = $toUtf8Sequence($title);
                    }
                    if ($char !== $upper) {
                        $utf8CaseFolding['toUpper'][$utf8_char] = $toUtf8Sequence($upper);
                    }
                }
            }
            fclose($fh);
        }
    }
    // Process custom decompositions
    $customTranslitFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/Translit.txt';
    if (GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
        $fh = fopen($customTranslitFile, 'rb');
        if ($fh) {
            while (($line = fgets($fh, 4096)) !== false) {
                if ($line[0] === '#' || trim($line) === '') {
                    continue;
                }
                list($char, $translit) = array_pad(GeneralUtility::trimExplode(';', $line), 2, '');
                if (!$translit) {
                    $omit['U+' . $char] = 1;
                }
                $decomposition['U+' . $char] = explode(' ', $translit);
            }
            fclose($fh);
        }
    }
    // Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
    foreach ($decomposition as $from => $to) {
        $code_decomp = [];
        while ($code_value = array_shift($to)) {
            // Do recursive decomposition
            if (isset($decomposition['U+' . $code_value])) {
                foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
                    array_unshift($to, $cv);
                }
            } elseif (!isset($mark['U+' . $code_value])) {
                // remove mark
                $code_decomp[] = $code_value;
            }
        }
        if (!empty($code_decomp) || isset($omit[$from])) {
            $decomposition[$from] = $code_decomp;
        } else {
            unset($decomposition[$from]);
        }
    }
    // Create ascii only mapping
    $this->toASCII['utf-8'] = [];
    $ascii = &$this->toASCII['utf-8'];
    foreach ($decomposition as $from => $to) {
        $code_decomp = [];
        while ($code_value = array_shift($to)) {
            $ord = hexdec($code_value);
            if ($ord > 127) {
                // Skip decompositions containing non-ASCII chars
                continue 2;
            }
            $code_decomp[] = chr($ord);
        }
        $ascii[$this->UnumberToChar(hexdec(substr($from, 2)))] = implode('', $code_decomp);
    }
    // Add numeric decompositions
    foreach ($number as $from => $to) {
        $utf8_char = $this->UnumberToChar(hexdec(substr($from, 2)));
        if (!isset($ascii[$utf8_char])) {
            $ascii[$utf8_char] = $to;
        }
    }
    if ($cacheFileCase) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
    }
    if ($cacheFileASCII) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
    }
    return 3;
}
1026 
/**
 * Builds (or loads from cache) the case folding table of a charset, derived from the
 * UTF-8 case folding table and the charset's conversion table.
 *
 * @param string $charset Charset name (expected to be normalized already)
 * @return int|bool 1 if the table was already loaded, 2 if it was loaded from cache, 3 if it was built, FALSE if the charset or the Unicode data could not be initialized
 */
public function initCaseFolding($charset)
{
    // Only process if the case table is not yet loaded; isset() avoids a notice on first use
    if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
        return 1;
    }
    // Use cached version if possible
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/cscase_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $this->caseFolding[$charset] = unserialize(file_get_contents($cacheFile));
        return 2;
    }
    // init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8 case folding is used as the base conversion table
    if (!$this->initUnicodeData('case')) {
        return false;
    }
    $nochar = chr($this->noCharByteVal);
    $this->caseFolding[$charset] = ['toUpper' => [], 'toLower' => [], 'toTitle' => []];
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        foreach (['toUpper', 'toLower', 'toTitle'] as $direction) {
            // Most characters have no mapping in a given direction; skip them without a notice
            if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) {
                continue;
            }
            $cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
            // Keep the mapping only if the folded character exists in the charset
            if ($cc !== '' && $cc !== $nochar) {
                $this->caseFolding[$charset][$direction][$c] = $cc;
            }
        }
    }
    // Add the ASCII case table
    for ($i = ord('a'), $end = ord('z'); $i <= $end; $i++) {
        $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
    }
    for ($i = ord('A'), $end = ord('Z'); $i <= $end; $i++) {
        $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
    }
    return 3;
}
1088 
/**
 * Builds (or loads from cache) the ASCII transliteration table of a charset, derived from
 * the UTF-8 transliteration table and the charset's conversion table.
 *
 * The table is initialized as an empty array before it is filled, so a charset without any
 * transliterable character still ends up with a valid (empty) table in memory and in the cache.
 *
 * @param string $charset Charset name (expected to be normalized already)
 * @return int|bool 1 if the table was already loaded, 2 if it was loaded from cache, 3 if it was built, FALSE if the charset or the Unicode data could not be initialized
 */
public function initToASCII($charset)
{
    // Only process if the table is not yet loaded; isset() avoids a notice on first use
    if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
        return 1;
    }
    // Use cached version if possible
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/csascii_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $this->toASCII[$charset] = unserialize(file_get_contents($cacheFile));
        return 2;
    }
    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8/ASCII transliteration is used as the base conversion table
    if (!$this->initUnicodeData('ascii')) {
        return false;
    }
    $this->toASCII[$charset] = [];
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        if (!isset($this->toASCII['utf-8'][$utf8])) {
            continue;
        }
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
    }
    return 3;
}
1129 
1130  /********************************************
1131  *
1132  * String operation functions
1133  *
1134  ********************************************/
/**
 * Returns a portion of a string, with positions counted in characters of the given charset.
 * Delegates to mb_substr().
 *
 * NOTE(review): the embedded listing numbers jump from 1148 to 1150 inside this body, so a
 * statement may be missing from this extract - confirm against the upstream file.
 *
 * @param string $charset Charset of $string
 * @param string $string Input string
 * @param int $start Start position (negative values count from the end, as in mb_substr())
 * @param int|null $len Maximum number of characters; NULL means up to the end of the string
 * @return string The substring
 */
1147  public function substr($charset, $string, $start, $len = null)
1148  {
1150  return mb_substr($string, $start, $len, $charset);
1151  }
1152 
/**
 * Returns the number of characters of a string in the given charset.
 * Delegates to mb_strlen().
 *
 * NOTE(review): the embedded listing numbers jump from 1164 to 1166 inside this body, so a
 * statement may be missing from this extract - confirm against the upstream file.
 *
 * @param string $charset Charset of $string
 * @param string $string Input string
 * @return int Number of characters
 */
1163  public function strlen($charset, $string)
1164  {
1166  return mb_strlen($string, $charset);
1167  }
1168 
/**
 * Shortens a string to at most abs($len) characters of the given charset.
 * A positive $len keeps the beginning and appends $crop; a negative $len keeps the end
 * and prepends $crop. The string is returned unchanged if $len is 0 or the string is
 * not longer than abs($len) characters.
 *
 * @param string $charset Charset of $string
 * @param string $string Input string
 * @param int $len Number of characters to keep; negative to keep the end of the string
 * @param string $crop Text added where the string was cut
 * @return string The shortened string
 */
public function crop($charset, $string, $len, $crop = '')
{
    if ((int)$len === 0) {
        return $string;
    }
    if (mb_strlen($string, $charset) <= abs($len)) {
        return $string;
    }
    if ($len > 0) {
        return mb_substr($string, 0, $len, $charset) . $crop;
    }
    $tail = mb_substr($string, $len, mb_strlen($string, $charset), $charset);
    return $crop . $tail;
}
1192 
/**
 * Truncates a string to at most $len bytes using mb_strcut() with the given charset.
 * Returns an empty string when $len is zero or negative.
 *
 * NOTE(review): the embedded listing numbers jump from 1204 to 1206 inside this body, so a
 * statement may be missing from this extract - confirm against the upstream file.
 *
 * @param string $charset Charset of $string
 * @param string $string Input string
 * @param int $len Maximum length in bytes
 * @return string The truncated string
 */
1203  public function strtrunc($charset, $string, $len)
1204  {
1206  if ($len <= 0) {
1207  return '';
1208  }
1209  return mb_strcut($string, 0, $len, $charset);
1210  }
1211 
/**
 * Converts a string to lower or upper case in the given charset.
 * 'toLower' selects mb_strtolower(); every other value of $case selects mb_strtoupper().
 *
 * NOTE(review): the embedded listing numbers jump from 1228 to 1230 inside this body, so a
 * statement may be missing from this extract - confirm against the upstream file.
 *
 * @param string $charset Charset of $string
 * @param string $string Input string
 * @param string $case 'toLower' for lower case, anything else for upper case
 * @return string The case-converted string
 */
1227  public function conv_case($charset, $string, $case)
1228  {
1230  return $case === 'toLower'
1231  ? mb_strtolower($string, $charset)
1232  : mb_strtoupper($string, $charset);
1233  }
1234 
/**
 * Changes the case of the first character of a string and leaves the rest untouched.
 * 'toLower' lower-cases the first character; every other value of $case upper-cases it.
 *
 * @param string $charset Charset of $string
 * @param string $string Input string
 * @param string $case 'toLower' for lower case, anything else for upper case
 * @return string The string with its first character converted
 */
public function convCaseFirst($charset, $string, $case)
{
    $head = mb_substr($string, 0, 1, $charset);
    $tail = mb_substr($string, 1, null, $charset);
    if ($case === 'toLower') {
        $head = mb_strtolower($head, $charset);
    } else {
        $head = mb_strtoupper($head, $charset);
    }
    return $head . $tail;
}
1252 
/**
 * Converts a string to title case in the given charset via mb_convert_case() with MB_CASE_TITLE.
 *
 * NOTE(review): the embedded listing numbers jump from 1262 to 1264 inside this body, so a
 * statement may be missing from this extract - confirm against the upstream file.
 *
 * @param string $charset Charset of $string
 * @param string $string Input string
 * @return string The title-cased string
 */
1261  public function convCapitalize($charset, $string)
1262  {
1264  return mb_convert_case($string, MB_CASE_TITLE, $charset);
1265  }
1266 
/**
 * Runs the 'ascii' character mapping that matches the charset over a string.
 *
 * 'utf-8' uses the UTF-8 mapper, charsets listed in $this->eucBasedSets use
 * the EUC mapper, and every other charset is handled as a single-byte encoding.
 *
 * @param string $charset Character set of the string
 * @param string $string Input string
 * @return string The mapped string
 */
public function specCharsToASCII($charset, $string)
{
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string, 'ascii');
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'ascii');
    }

    // Everything else is treated as a single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'ascii');
}
1286 
/**
 * Delegates to Locales::getPreferredClientLanguage() and returns its result.
 *
 * @param string $languageCodesList Passed through unchanged to the Locales instance
 * @return mixed Whatever Locales::getPreferredClientLanguage() returns
 */
public function getPreferredClientLanguage($languageCodesList)
{
    return GeneralUtility::makeInstance(Locales::class)->getPreferredClientLanguage($languageCodesList);
}
1302 
1303  /********************************************
1304  *
1305  * Internal string operation functions
1306  *
1307  ********************************************/
/**
 * Maps every byte of a single-byte encoded string through a lookup table.
 *
 * Mode 'case' uses $this->caseFolding[$charset][$opt], mode 'ascii' uses
 * $this->toASCII[$charset]. Bytes without a table entry are copied verbatim.
 * The input is returned unchanged when the mode is unknown or the table
 * initialisation reports failure.
 *
 * @param string $str Input string
 * @param string $charset The character set of the string
 * @param string $mode Mapping mode: 'case' or 'ascii'
 * @param string $opt Sub-table key used in 'case' mode
 * @return string The mapped string
 */
public function sb_char_mapping($str, $charset, $mode, $opt = '')
{
    // Loose comparison on purpose: mirrors switch() semantics
    if ($mode == 'case') {
        if (!$this->initCaseFolding($charset)) {
            return $str;
        }
        $map = &$this->caseFolding[$charset][$opt];
    } elseif ($mode == 'ascii') {
        if (!$this->initToASCII($charset)) {
            return $str;
        }
        $map = &$this->toASCII[$charset];
    } else {
        return $str;
    }

    $out = '';
    $i = 0;
    while (isset($str[$i])) {
        $byte = $str[$i];
        $out .= isset($map[$byte]) ? $map[$byte] : $byte;
        $i++;
    }

    return $out;
}
1348 
1349  /********************************************
1350  *
1351  * Internal UTF-8 string operation functions
1352  *
1353  ********************************************/
/**
 * Returns a portion of a UTF-8 string, counted in characters.
 *
 * Character positions are translated to byte positions with
 * utf8_char2byte_pos() and the cutting itself is done with substr().
 *
 * @param string $str UTF-8 string
 * @param int $start Start position (character position)
 * @param int|null $len Length (in characters); a loosely-null value means "to the end"
 * @return string|bool The substring, or false when a positive $start lies outside the string
 */
public function utf8_substr($str, $start, $len = null)
{
    if ((string)$len === '0') {
        return '';
    }
    $startByte = $this->utf8_char2byte_pos($str, $start);
    if ($startByte === false && $start > 0) {
        // $start outside string length
        return false;
    }
    // A false $startByte with $start <= 0 is used by substr() as offset 0
    $tail = substr($str, $startByte);
    if ($len == null) {
        return $tail;
    }
    $endByte = $this->utf8_char2byte_pos($tail, $len);
    if ($endByte === false) {
        // $len outside actual string length: a negative length that
        // exceeds the string yields a blank string
        return $len < 0 ? '' : $tail;
    }

    return substr($tail, 0, $endByte);
}
1390 
/**
 * Counts the characters of a UTF-8 string.
 *
 * Every single-byte character (0xxxxxxx) and every multi-byte start byte
 * (11xxxxxx) counts as one character; continuation bytes are not counted.
 *
 * @param string $str UTF-8 string
 * @return int The number of characters
 */
public function utf8_strlen($str)
{
    $count = 0;
    $i = 0;
    while (isset($str[$i])) {
        $byte = ord($str[$i]);
        if (!($byte & 128) || ($byte & 192) === 192) {
            $count++;
        }
        $i++;
    }

    return $count;
}
1416 
/**
 * Truncates a UTF-8 string to at most $len bytes without splitting a
 * multi-byte character.
 *
 * If the byte at the cut position belongs to a multi-byte sequence that
 * does not completely fit into $len bytes, the whole sequence is dropped.
 *
 * @param string $str UTF-8 string
 * @param int $len Maximum length (in bytes)
 * @return string The truncated string; '' when $len is zero or negative
 */
public function utf8_strtrunc($str, $len)
{
    $str = (string)$str;
    $len = (int)$len;
    if ($len <= 0) {
        return '';
    }
    if ($len >= strlen($str)) {
        // Nothing to cut; also avoids reading an offset beyond the string
        return $str;
    }
    $i = $len - 1;
    // Last kept byte is part of a multi-byte sequence
    if (ord($str[$i]) & 128) {
        // Walk back over continuation bytes (10xxxxxx) to the start byte
        while ($i > 0 && (ord($str[$i]) & 192) === 128) {
            $i--;
        }
        $lead = ord($str[$i]);
        if (($lead & 192) === 192) {
            // The number of leading 1-bits is the sequence length in bytes
            $bc = 0;
            for ($mbs = $lead; $mbs & 128; $mbs = $mbs << 1) {
                $bc++;
            }
            if ($i + $bc > $len) {
                // Sequence would be cut in half: drop it completely
                return substr($str, 0, $i);
            }
        }
    }

    return substr($str, 0, $len);
}
1449 
/**
 * Finds the first occurrence of a needle in a UTF-8 string.
 *
 * Thin wrapper around mb_strpos() with the encoding fixed to 'utf-8'.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param int $offset Character position to start the search at
 * @return int|bool The character position, or false if the needle was not found
 */
public function utf8_strpos($haystack, $needle, $offset = 0)
{
    $position = mb_strpos($haystack, $needle, $offset, 'utf-8');

    return $position;
}
1465 
/**
 * Finds the last occurrence of a needle in a UTF-8 string.
 *
 * Thin wrapper around mb_strrpos() with the encoding fixed to 'utf-8'.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @return int|bool The character position, or false if the needle was not found
 */
public function utf8_strrpos($haystack, $needle)
{
    // The third argument of mb_strrpos() is the offset, not the encoding.
    // Passing the encoding there is deprecated since PHP 7.4 and a TypeError in PHP 8.
    return mb_strrpos($haystack, $needle, 0, 'utf-8');
}
1480 
/**
 * Translates a character position into a byte position within a UTF-8 string.
 *
 * A non-negative $pos is counted from the start of the string, a negative
 * $pos from its end. Explicit bounds checks are used instead of
 * isset($str[$i]) because negative string offsets are valid since PHP 7.1
 * and would make the backward scan wrap around to the end of the string.
 *
 * @param string $str UTF-8 string
 * @param int $pos Character position (negative values count from the end)
 * @return int|bool Byte position, or false if $pos lies beyond the string
 */
public function utf8_char2byte_pos($str, $pos)
{
    $str = (string)$str;
    $length = strlen($str);
    // Number of characters found
    $n = 0;
    // Number of characters wanted
    $p = abs($pos);
    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = $length - 1;
        $d = -1;
    }
    for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
        $c = ord($str[$i]);
        // Single-byte (0xxxxxxx) or multi-byte start byte (11xxxxxx)
        if (!($c & 128) || ($c & 192) === 192) {
            $n++;
        }
    }
    if ($pos < 0) {
        if ($n < $p) {
            // String exhausted before enough characters were seen
            return false;
        }
        // The scan stopped one byte before the start byte of the wanted character
        return $i + 1;
    }
    if ($i >= $length) {
        // Offset beyond string length
        return false;
    }
    // Skip trailing multi-byte continuation bytes (10xxxxxx), never reading past the end
    while ($i < $length && (ord($str[$i]) & 192) === 128) {
        $i++;
    }

    return $i;
}
1527 
/**
 * Translates a byte position into a character position within a UTF-8 string.
 *
 * Counts the character start bytes between index 1 and $pos (inclusive),
 * which is the zero-based index of the character that contains byte $pos.
 *
 * @param string $str UTF-8 string
 * @param int $pos Byte position (zero-based)
 * @return int|bool Character position, or false if $pos lies outside the string
 */
public function utf8_byte2char_pos($str, $pos)
{
    $str = (string)$str;
    $pos = (int)$pos;
    // Validate before scanning: out-of-range offsets read as '' and would
    // otherwise be counted as single-byte characters
    if ($pos < 0 || $pos >= strlen($str)) {
        // Offset beyond string length
        return false;
    }
    // Number of characters
    $n = 0;
    for ($i = $pos; $i > 0; $i--) {
        $c = ord($str[$i]);
        // Single-byte (0xxxxxxx) or multi-byte start byte (11xxxxxx)
        if (!($c & 128) || ($c & 192) === 192) {
            $n++;
        }
    }

    return $n;
}
1559 
/**
 * Maps every character of a UTF-8 string through a lookup table.
 *
 * Mode 'case' uses $this->caseFolding['utf-8'][$opt], mode 'ascii' uses
 * $this->toASCII['utf-8']. Characters without a table entry are copied
 * verbatim. The input is returned unchanged when initUnicodeData() reports
 * failure or the mode is unknown.
 *
 * @param string $str UTF-8 string
 * @param string $mode Mapping mode: 'case' or 'ascii'
 * @param string $opt Sub-table key used in 'case' mode
 * @return string The mapped string
 */
public function utf8_char_mapping($str, $mode, $opt = '')
{
    if (!$this->initUnicodeData($mode)) {
        return $str;
    }
    // Loose comparison on purpose: mirrors switch() semantics
    if ($mode == 'case') {
        $map = &$this->caseFolding['utf-8'][$opt];
    } elseif ($mode == 'ascii') {
        $map = &$this->toASCII['utf-8'];
    } else {
        return $str;
    }

    $out = '';
    $i = 0;
    while (isset($str[$i])) {
        $byte = ord($str[$i]);
        // Stays empty for a stray continuation byte (10xxxxxx)
        $char = '';
        if (!($byte & 128)) {
            // Single-byte (0xxxxxxx)
            $char = $str[$i];
        } elseif (($byte & 192) === 192) {
            // Multi-byte start byte (11xxxxxx): the number of leading
            // 1-bits is the sequence length in bytes
            $numBytes = 0;
            while ($byte & 128) {
                $numBytes++;
                $byte = $byte << 1;
            }
            $char = substr($str, $i, $numBytes);
            $i += $numBytes - 1;
        }
        $out .= isset($map[$char]) ? $map[$char] : $char;
        $i++;
    }

    return $out;
}
1609 
1610  /********************************************
1611  *
1612  * Internal EUC string operation functions
1613  *
1614  * Extended Unix Code:
1615  * ASCII compatible 7bit single bytes chars
1616  * 8bit two byte chars
1617  *
1618  * Shift-JIS is treated as a special case.
1619  *
1620  ********************************************/
/**
 * Truncates an EUC (or Shift-JIS) string to at most $len bytes without
 * splitting a double-byte character.
 *
 * @param string $str EUC multibyte character string
 * @param int $len Maximum length (in bytes)
 * @param string $charset The charset; 'shift_jis' selects the Shift-JIS lead-byte ranges
 * @return string The truncated string; '' when $len is zero or negative
 */
public function euc_strtrunc($str, $len, $charset)
{
    $str = (string)$str;
    $len = (int)$len;
    if ($len <= 0) {
        return '';
    }
    if (strlen($str) <= $len) {
        // String not longer than supplied length
        return $str;
    }
    $shiftJis = $charset === 'shift_jis';
    // $len < strlen($str) here, so every offset read below exists
    $i = 0;
    while ($i < $len) {
        $c = ord($str[$i]);
        if ($shiftJis) {
            $isLeadByte = ($c >= 128 && $c < 160) || $c >= 224;
        } else {
            $isLeadByte = $c >= 128;
        }
        $i += $isLeadByte ? 2 : 1;
    }
    if ($i > $len) {
        // Byte $len - 1 is the first byte of a double-byte character: drop it
        return substr($str, 0, $len - 1);
    }

    return substr($str, 0, $len);
}
1658 
/**
 * Returns a portion of an EUC (or Shift-JIS) string, counted in characters.
 *
 * Character positions are translated to byte positions with
 * euc_char2byte_pos() and the cutting itself is done with substr().
 *
 * @param string $str EUC multibyte character string
 * @param int $start Start position (character position)
 * @param string $charset The charset
 * @param int|null $len Length (in characters); a loosely-null value means "to the end"
 * @return string|bool The substring, or false when $start lies outside the string
 */
public function euc_substr($str, $start, $charset, $len = null)
{
    // A zero length must give an empty string; without this check the
    // loose "!= null" test below would treat 0 like "no length given"
    if ((string)$len === '0') {
        return '';
    }
    $byte_start = $this->euc_char2byte_pos($str, $start, $charset);
    if ($byte_start === false) {
        // $start outside string length
        return false;
    }
    $str = substr($str, $byte_start);
    if ($len != null) {
        $byte_end = $this->euc_char2byte_pos($str, $len, $charset);
        if ($byte_end === false) {
            // $len outside actual string length: a negative length that
            // exceeds the string leaves nothing, a positive one keeps the rest
            return $len < 0 ? '' : $str;
        }
        return substr($str, 0, $byte_end);
    }

    return $str;
}
1688 
/**
 * Counts the characters of an EUC (or Shift-JIS) string.
 *
 * A lead byte consumes the following byte as well, so each double-byte
 * character is counted once.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset; 'shift_jis' selects the Shift-JIS lead-byte ranges
 * @return int The number of characters
 */
public function euc_strlen($str, $charset)
{
    $isShiftJis = $charset === 'shift_jis';
    $count = 0;
    $i = 0;
    while (isset($str[$i])) {
        $byte = ord($str[$i]);
        if ($isShiftJis) {
            $isLeadByte = ($byte >= 128 && $byte < 160) || $byte >= 224;
        } else {
            $isLeadByte = $byte >= 128;
        }
        $i += $isLeadByte ? 2 : 1;
        $count++;
    }

    return $count;
}
1718 
/**
 * Translates a character position into a byte position within an EUC
 * (or Shift-JIS) string.
 *
 * A non-negative $pos is counted from the start of the string, a negative
 * $pos from its end. Explicit bounds checks are used instead of
 * isset($str[$i]) because negative string offsets are valid since PHP 7.1
 * and would make the backward scan wrap around to the end of the string.
 *
 * @param string $str EUC multibyte character string
 * @param int $pos Character position (negative values count from the end)
 * @param string $charset The charset; 'shift_jis' selects the Shift-JIS lead-byte ranges
 * @return int|bool Byte position, or false if $pos lies beyond the string
 */
public function euc_char2byte_pos($str, $pos, $charset)
{
    $str = (string)$str;
    $length = strlen($str);
    $sjis = $charset === 'shift_jis';
    // Number of characters seen
    $n = 0;
    // Number of characters wanted
    $p = abs($pos);
    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = $length - 1;
        $d = -1;
    }
    for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
        $c = ord($str[$i]);
        if ($sjis) {
            $isDoubleByte = ($c >= 128 && $c < 160) || $c >= 224;
        } else {
            $isDoubleByte = $c >= 128;
        }
        if ($isDoubleByte) {
            // Step over the second byte of the character
            $i += $d;
        }
        $n++;
    }
    if ($pos < 0) {
        if ($n < $p) {
            // String exhausted before enough characters were seen
            return false;
        }
        // Correct offset: the scan stopped one byte before the character.
        // Clamp at 0 for a lone high byte at the very start of the string.
        return max($i + 1, 0);
    }
    if ($i >= $length) {
        // Offset beyond string length
        return false;
    }

    return $i;
}
1766 
/**
 * Maps every character of an EUC (or Shift-JIS) string through a lookup table.
 *
 * Mode 'case' uses $this->caseFolding[$charset][$opt], mode 'ascii' uses
 * $this->toASCII[$charset]. Characters without a table entry are copied
 * verbatim. The input is returned unchanged when the mode is unknown or the
 * table initialisation reports failure.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset; 'shift_jis' selects the Shift-JIS lead-byte ranges
 * @param string $mode Mapping mode: 'case' or 'ascii'
 * @param string $opt Sub-table key used in 'case' mode
 * @return string The mapped string
 */
public function euc_char_mapping($str, $charset, $mode, $opt = '')
{
    // Loose comparison on purpose: mirrors switch() semantics
    if ($mode == 'case') {
        if (!$this->initCaseFolding($charset)) {
            return $str;
        }
        $map = &$this->caseFolding[$charset][$opt];
    } elseif ($mode == 'ascii') {
        if (!$this->initToASCII($charset)) {
            return $str;
        }
        $map = &$this->toASCII[$charset];
    } else {
        return $str;
    }

    $isShiftJis = $charset === 'shift_jis';
    $out = '';
    $i = 0;
    while (isset($str[$i])) {
        $char = $str[$i];
        $byte = ord($char);
        if ($isShiftJis) {
            $isDoubleByte = ($byte >= 128 && $byte < 160) || $byte >= 224;
        } else {
            $isDoubleByte = $byte >= 128;
        }
        if ($isDoubleByte) {
            // Take the lead byte together with the byte that follows it
            $char = substr($str, $i, 2);
            $i++;
        }
        $out .= isset($map[$char]) ? $map[$char] : $char;
        $i++;
    }

    return $out;
}
1822 }
utf8_strpos($haystack, $needle, $offset=0)
convArray(&$array, $fromCharset, $toCharset, $useEntityForNoChar=false)
euc_char_mapping($str, $charset, $mode, $opt='')
static writeFileToTypo3tempDir($filepath, $content)
static getFileAbsFileName($filename, $_=null, $_2=null)
static trimExplode($delim, $string, $removeEmptyValues=false, $limit=0)
static makeInstance($className,... $constructorArguments)
crop($charset, $string, $len, $crop='')
euc_substr($str, $start, $charset, $len=null)
sb_char_mapping($str, $charset, $mode, $opt='')
substr($charset, $string, $start, $len=null)
conv($inputString, $fromCharset, $toCharset, $useEntityForNoChar=false)
utf8_decode($str, $charset, $useEntityForNoChar=false)
$locales
Definition: be_users.php:6