TYPO3 CMS  TYPO3_7-6
CharsetConverter.php
Go to the documentation of this file.
1 <?php
3 
4 /*
5  * This file is part of the TYPO3 CMS project.
6  *
7  * It is free software; you can redistribute it and/or modify it under
8  * the terms of the GNU General Public License, either version 2
9  * of the License, or any later version.
10  *
11  * For the full copyright and license information, please read the
12  * LICENSE.txt file that was distributed with this source code.
13  *
14  * The TYPO3 project - inspiring people to share!
15  */
16 
20 
54 {
/**
 * ASCII code of the replacement character that is emitted whenever a
 * character cannot be converted (63 is "?").
 *
 * @var int
 */
public $noCharByteVal = 63;

/**
 * Parsed conversion tables, keyed by charset name. initCharset() fills each
 * entry with a 'local' (local code => UTF-8 char) and a 'utf8'
 * (UTF-8 char => local code) map.
 *
 * @var array
 */
public $parsedCharsets = [];

/**
 * Case folding tables, keyed by charset name. initUnicodeData() stores the
 * 'toUpper', 'toLower' and 'toTitle' maps for 'utf-8' here.
 *
 * @var array
 */
public $caseFolding = [];

/**
 * To-ASCII tables, keyed by charset name. initUnicodeData() loads the
 * cached 'utf-8' entry in its 'ascii' mode.
 *
 * @var array
 */
public $toASCII = [];

/**
 * Charsets in which every character occupies two bytes; utf8_encode()
 * reads such input as big-endian 16-bit units.
 *
 * @var array
 */
public $twoByteSets = [
    'ucs-2' => 1,
];

/**
 * Charsets in which every character occupies four bytes.
 * NOTE(review): not referenced by the methods visible in this section.
 *
 * @var array
 */
public $fourByteSets = [
    // 4-byte Unicode
    'ucs-4' => 1,
    'utf-32' => 1,
];

/**
 * Charsets that utf8_encode() treats as EUC based, i.e. bytes above 127
 * introduce a two-byte character.
 *
 * @var array
 */
public $eucBasedSets = [
    // Chinese, simplified.
    'gb2312' => 1,
    // Chinese, traditional.
    'big5' => 1,
    // Korean
    'euc-kr' => 1,
    'shift_jis' => 1,
];
113 
/**
 * Alternative charset names mapped to the name used internally.
 * parse_charset() looks up the lower-cased, trimmed input in this table.
 *
 * @var array
 */
public $synonyms = [
    // ASCII
    'us' => 'ascii',
    'us-ascii' => 'ascii',

    // ISO-8859-x: code page and ISO registry names
    'cp819' => 'iso-8859-1',
    'ibm819' => 'iso-8859-1',
    'iso-ir-100' => 'iso-8859-1',
    'iso-ir-101' => 'iso-8859-2',
    'iso-ir-109' => 'iso-8859-3',
    'iso-ir-110' => 'iso-8859-4',
    'iso-ir-144' => 'iso-8859-5',
    'iso-ir-127' => 'iso-8859-6',
    'iso-ir-126' => 'iso-8859-7',
    'iso-ir-138' => 'iso-8859-8',
    'iso-ir-148' => 'iso-8859-9',
    'iso-ir-157' => 'iso-8859-10',
    'iso-ir-179' => 'iso-8859-13',
    'iso-ir-199' => 'iso-8859-14',
    'iso-ir-203' => 'iso-8859-15',

    // ISO-8859-x: "csISOLatin" names
    'csisolatin1' => 'iso-8859-1',
    'csisolatin2' => 'iso-8859-2',
    'csisolatin3' => 'iso-8859-3',
    'csisolatin5' => 'iso-8859-9',
    'csisolatin8' => 'iso-8859-14',
    'csisolatin9' => 'iso-8859-15',
    'csisolatingreek' => 'iso-8859-7',
    'iso-celtic' => 'iso-8859-14',

    // ISO-8859-x: "latinN" and "lN" names
    'latin1' => 'iso-8859-1',
    'latin2' => 'iso-8859-2',
    'latin3' => 'iso-8859-3',
    'latin5' => 'iso-8859-9',
    'latin6' => 'iso-8859-10',
    'latin8' => 'iso-8859-14',
    'latin9' => 'iso-8859-15',
    'l1' => 'iso-8859-1',
    'l2' => 'iso-8859-2',
    'l3' => 'iso-8859-3',
    'l5' => 'iso-8859-9',
    'l6' => 'iso-8859-10',
    'l8' => 'iso-8859-14',
    'l9' => 'iso-8859-15',

    // ISO-8859-x: script names
    'cyrillic' => 'iso-8859-5',
    'arabic' => 'iso-8859-6',
    'tis-620' => 'iso-8859-11',

    // Windows code pages
    'win874' => 'windows-874',
    'win1250' => 'windows-1250',
    'win1251' => 'windows-1251',
    'win1252' => 'windows-1252',
    'win1253' => 'windows-1253',
    'win1254' => 'windows-1254',
    'win1255' => 'windows-1255',
    'win1256' => 'windows-1256',
    'win1257' => 'windows-1257',
    'win1258' => 'windows-1258',
    'cp1250' => 'windows-1250',
    'cp1251' => 'windows-1251',
    'cp1252' => 'windows-1252',
    'ms-ee' => 'windows-1250',
    'ms-ansi' => 'windows-1252',
    'ms-greek' => 'windows-1253',
    'ms-turk' => 'windows-1254',
    'winbaltrim' => 'windows-1257',

    // KOI8
    'koi-8ru' => 'koi-8r',
    'koi8r' => 'koi-8r',
    'cp878' => 'koi-8r',

    // Macintosh
    'mac' => 'macroman',
    'macintosh' => 'macroman',

    // East Asian charsets
    'euc-cn' => 'gb2312',
    'x-euc-cn' => 'gb2312',
    'euccn' => 'gb2312',
    'cp936' => 'gb2312',
    'big-5' => 'big5',
    'cp950' => 'big5',
    'eucjp' => 'euc-jp',
    'sjis' => 'shift_jis',
    'shift-jis' => 'shift_jis',
    'cp932' => 'shift_jis',
    'cp949' => 'euc-kr',

    // Unicode encodings
    'utf7' => 'utf-7',
    'utf8' => 'utf-8',
    'utf16' => 'utf-16',
    'utf32' => 'utf-32',
    'ucs2' => 'ucs-2',
    'ucs4' => 'ucs-4',
];
204 
/**
 * Maps a language identifier to the name of its script/charset family.
 * get_locale_charset() looks up the language part of a locale here and then
 * resolves the script through the OS specific script-to-charset table.
 *
 * Three kinds of keys are listed: ISO 639-1 codes, MS language codes and
 * English language names.
 *
 * @var array
 */
public $lang_to_script = [
    // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
    'af' => 'west_european', // Afrikaans
    'ar' => 'arabic',
    'bg' => 'cyrillic', // Bulgarian
    'bs' => 'east_european', // Bosnian
    'cs' => 'east_european', // Czech
    'da' => 'west_european', // Danish
    'de' => 'west_european', // German
    'es' => 'west_european', // Spanish
    'et' => 'estonian',
    'eo' => 'unicode', // Esperanto
    'eu' => 'west_european', // Basque
    'fa' => 'arabic', // Persian
    'fi' => 'west_european', // Finnish
    'fo' => 'west_european', // Faroese
    'fr' => 'west_european', // French
    'ga' => 'west_european', // Irish
    'gl' => 'west_european', // Galician
    'gr' => 'greek',
    'he' => 'hebrew', // Hebrew (since 1998)
    'hi' => 'unicode', // Hindi
    'hr' => 'east_european', // Croatian
    'hu' => 'east_european', // Hungarian
    'iw' => 'hebrew', // Hebrew (til 1998)
    'is' => 'west_european', // Icelandic
    'it' => 'west_european', // Italian
    'ja' => 'japanese',
    'ka' => 'unicode', // Georgian
    'kl' => 'west_european', // Greenlandic
    'km' => 'unicode', // Khmer
    'ko' => 'korean',
    'lt' => 'lithuanian',
    'lv' => 'west_european', // Latvian/Lettish
    'nl' => 'west_european', // Dutch
    'no' => 'west_european', // Norwegian
    'nb' => 'west_european', // Norwegian Bokmal
    'nn' => 'west_european', // Norwegian Nynorsk
    'pl' => 'east_european', // Polish
    'pt' => 'west_european', // Portuguese
    'ro' => 'east_european', // Romanian
    'ru' => 'cyrillic', // Russian
    'sk' => 'east_european', // Slovak
    'sl' => 'east_european', // Slovenian
    'sr' => 'cyrillic', // Serbian
    'sv' => 'west_european', // Swedish
    'sq' => 'albanian', // Albanian
    'th' => 'thai',
    'uk' => 'cyrillic', // Ukrainian
    'vi' => 'vietnamese',
    'zh' => 'chinese',

    // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
    // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
    'afk' => 'west_european', // Afrikaans
    'ara' => 'arabic',
    'bgr' => 'cyrillic', // Bulgarian
    'cat' => 'west_european', // Catalan
    'chs' => 'simpl_chinese',
    'cht' => 'trad_chinese',
    'csy' => 'east_european', // Czech
    'dan' => 'west_european', // Danish
    'deu' => 'west_european', // German
    'dea' => 'west_european', // German (Austrian)
    'des' => 'west_european', // German (Swiss)
    'ena' => 'west_european', // English (Australian)
    'enc' => 'west_european', // English (Canadian)
    'eng' => 'west_european', // English
    'enz' => 'west_european', // English (New Zealand)
    'enu' => 'west_european', // English (United States)
    'euq' => 'west_european', // Basque
    'fos' => 'west_european', // Faroese
    'far' => 'arabic', // Persian
    'fin' => 'west_european', // Finnish
    'fra' => 'west_european', // French
    'frb' => 'west_european', // French (Belgian)
    'frc' => 'west_european', // French (Canadian)
    'frs' => 'west_european', // French (Swiss)
    'geo' => 'unicode', // Georgian
    'glg' => 'west_european', // Galician
    'ell' => 'greek',
    'heb' => 'hebrew',
    'hin' => 'unicode', // Hindi
    'hun' => 'east_european', // Hungarian
    'isl' => 'west_european', // Icelandic
    'ita' => 'west_european', // Italian
    'its' => 'west_european', // Italian (Swiss)
    'jpn' => 'japanese',
    'khm' => 'unicode', // Khmer
    'kor' => 'korean',
    'lth' => 'lithuanian',
    'lvi' => 'west_european', // Latvian/Lettish
    'msl' => 'west_european', // Malay
    'nlb' => 'west_european', // Dutch (Belgian)
    'nld' => 'west_european', // Dutch
    'nor' => 'west_european', // Norwegian (bokmal)
    'non' => 'west_european', // Norwegian (nynorsk)
    'plk' => 'east_european', // Polish
    'ptg' => 'west_european', // Portuguese
    'ptb' => 'west_european', // Portuguese (Brazil)
    'rom' => 'east_european', // Romanian
    'rus' => 'cyrillic', // Russian
    'slv' => 'east_european', // Slovenian
    'sky' => 'east_european', // Slovak
    'srl' => 'east_european', // Serbian (Latin)
    'srb' => 'cyrillic', // Serbian (Cyrillic)
    'esp' => 'west_european', // Spanish (trad. sort)
    'esm' => 'west_european', // Spanish (Mexican)
    'esn' => 'west_european', // Spanish (internat. sort)
    'sve' => 'west_european', // Swedish
    'sqi' => 'albanian', // Albanian
    'tha' => 'thai',
    'trk' => 'turkish',
    'ukr' => 'cyrillic', // Ukrainian

    // English language names
    'afrikaans' => 'west_european',
    'albanian' => 'albanian',
    'arabic' => 'arabic',
    'basque' => 'west_european',
    'bosnian' => 'east_european',
    // Bulgarian is written in Cyrillic, consistent with 'bg' and 'bgr' above
    'bulgarian' => 'cyrillic',
    'catalan' => 'west_european',
    'croatian' => 'east_european',
    'czech' => 'east_european',
    'danish' => 'west_european',
    'dutch' => 'west_european',
    'english' => 'west_european',
    'esperanto' => 'unicode',
    'estonian' => 'estonian',
    'faroese' => 'west_european',
    'farsi' => 'arabic',
    'finnish' => 'west_european',
    'french' => 'west_european',
    'galician' => 'west_european',
    'georgian' => 'unicode',
    'german' => 'west_european',
    'greek' => 'greek',
    'greenlandic' => 'west_european',
    'hebrew' => 'hebrew',
    'hindi' => 'unicode',
    'hungarian' => 'east_european',
    'icelandic' => 'west_european',
    'italian' => 'west_european',
    'khmer' => 'unicode',
    'latvian' => 'west_european',
    'lettish' => 'west_european',
    'lithuanian' => 'lithuanian',
    'malay' => 'west_european',
    'norwegian' => 'west_european',
    'persian' => 'arabic',
    'polish' => 'east_european',
    'portuguese' => 'west_european',
    'russian' => 'cyrillic',
    'romanian' => 'east_european',
    'serbian' => 'cyrillic',
    'slovak' => 'east_european',
    'slovenian' => 'east_european',
    'spanish' => 'west_european',
    'swedish' => 'west_european',
    // Misspelt key, kept only for backwards compatibility
    'svedish' => 'west_european',
    'thai' => 'thai',
    // Misspelt key, kept only for backwards compatibility
    'that' => 'thai',
    'turkish' => 'turkish',
    'ukrainian' => 'cyrillic',
];
374 
381  'west_european' => 'iso-8859-1',
382  'estonian' => 'iso-8859-1',
383  'east_european' => 'iso-8859-2',
384  'baltic' => 'iso-8859-4',
385  'cyrillic' => 'iso-8859-5',
386  'arabic' => 'iso-8859-6',
387  'greek' => 'iso-8859-7',
388  'hebrew' => 'iso-8859-8',
389  'turkish' => 'iso-8859-9',
390  'thai' => 'iso-8859-11', // = TIS-620
391  'lithuanian' => 'iso-8859-13',
392  'chinese' => 'gb2312', // = euc-cn
393  'japanese' => 'euc-jp',
394  'korean' => 'euc-kr',
395  'simpl_chinese' => 'gb2312',
396  'trad_chinese' => 'big5',
397  'vietnamese' => '',
398  'unicode' => 'utf-8',
399  'albanian' => 'utf-8'
400  ];
401 
408  'east_european' => 'windows-1250',
409  'cyrillic' => 'windows-1251',
410  'west_european' => 'windows-1252',
411  'greek' => 'windows-1253',
412  'turkish' => 'windows-1254',
413  'hebrew' => 'windows-1255',
414  'arabic' => 'windows-1256',
415  'baltic' => 'windows-1257',
416  'estonian' => 'windows-1257',
417  'lithuanian' => 'windows-1257',
418  'vietnamese' => 'windows-1258',
419  'thai' => 'cp874',
420  'korean' => 'cp949',
421  'chinese' => 'gb2312',
422  'japanese' => 'shift_jis',
423  'simpl_chinese' => 'gb2312',
424  'trad_chinese' => 'big5',
425  'albanian' => 'windows-1250',
426  'unicode' => 'utf-8'
427  ];
428 
435  'japanese.euc' => 'euc-jp',
436  'ja_jp.ujis' => 'euc-jp',
437  'korean.euc' => 'euc-kr',
438  'sr@Latn' => 'iso-8859-2',
439  'zh_cn' => 'gb2312',
440  'zh_hk' => 'big5',
441  'zh_tw' => 'big5'
442  ];
443 
/**
 * Charset per language key. An empty string means that no charset is
 * listed for that key.
 * NOTE(review): not referenced by the methods visible in this section.
 *
 * @var array
 */
public $charSetArray = [
    'af' => '',
    'ar' => 'iso-8859-6',
    'ba' => 'iso-8859-2',
    'bg' => 'windows-1251',
    'br' => '',
    'ca' => 'iso-8859-15',
    'ch' => 'gb2312',
    'cs' => 'windows-1250',
    'cz' => 'windows-1250',
    'da' => '',
    'de' => '',
    'dk' => '',
    'el' => 'iso-8859-7',
    'eo' => 'utf-8',
    'es' => '',
    'et' => 'iso-8859-4',
    'eu' => '',
    'fa' => 'utf-8',
    'fi' => '',
    'fo' => 'utf-8',
    'fr' => '',
    'fr_CA' => '',
    'ga' => '',
    'ge' => 'utf-8',
    'gl' => '',
    'gr' => 'iso-8859-7',
    'he' => 'utf-8',
    'hi' => 'utf-8',
    'hk' => 'big5',
    'hr' => 'windows-1250',
    'hu' => 'iso-8859-2',
    'is' => 'utf-8',
    'it' => '',
    'ja' => 'shift_jis',
    'jp' => 'shift_jis',
    'ka' => 'utf-8',
    'kl' => 'utf-8',
    'km' => 'utf-8',
    'ko' => 'euc-kr',
    'kr' => 'euc-kr',
    'lt' => 'windows-1257',
    'lv' => 'utf-8',
    'ms' => '',
    'my' => '',
    'nl' => '',
    'no' => '',
    'pl' => 'iso-8859-2',
    'pt' => '',
    'pt_BR' => '',
    'qc' => '',
    'ro' => 'iso-8859-2',
    'ru' => 'windows-1251',
    'se' => '',
    'si' => 'windows-1250',
    'sk' => 'windows-1250',
    'sl' => 'windows-1250',
    'sq' => 'utf-8',
    'sr' => 'utf-8',
    'sv' => '',
    'th' => 'iso-8859-11',
    'tr' => 'iso-8859-9',
    'ua' => 'windows-1251',
    'uk' => 'windows-1251',
    'vi' => 'utf-8',
    'vn' => 'utf-8',
    'zh' => 'big5',
];
518 
/**
 * Normalizes a charset name: lower-cases and trims it, then replaces it with
 * its canonical name when it is listed in $this->synonyms.
 *
 * @param string $charset Charset name as given by the caller
 * @return string Normalized charset name
 */
public function parse_charset($charset)
{
    $normalized = trim(strtolower($charset));

    return isset($this->synonyms[$normalized])
        ? $this->synonyms[$normalized]
        : $normalized;
}
533 
/**
 * Determines the charset that belongs to a locale string.
 *
 * Resolution order:
 *  1. exact (case-insensitive) match in $this->locale_to_charset,
 *  2. charset embedded in the locale ("xx_YY.charset"),
 *  3. the "@euro" modifier (=> iso-8859-15),
 *  4. the script of the language part, mapped through the Windows or Unix
 *     script-to-charset table depending on TYPO3_OS; falls back to
 *     "windows-1252" resp. "utf-8".
 *
 * @param string $locale Locale string, e.g. "de_DE.utf-8@euro"
 * @return string Charset name
 */
public function get_locale_charset($locale)
{
    $locale = strtolower($locale);
    // Exact locale specific charset? The table may contain mixed-case keys
    // (e.g. "sr@Latn"), which the lower-cased input could never hit, so the
    // keys are lower-cased for the lookup as well.
    $localeCharsets = array_change_key_case($this->locale_to_charset, CASE_LOWER);
    if (isset($localeCharsets[$locale])) {
        return $localeCharsets[$locale];
    }
    // Get modifier; array_pad() supplies an empty value when the separator is absent
    list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');
    // Locale contains charset: use it
    list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
    if ($charset) {
        return $this->parse_charset($charset);
    }
    // Modifier is 'euro' (after charset check, because of xx.utf-8@euro)
    if ($modifier === 'euro') {
        return 'iso-8859-15';
    }
    // Get language; an unknown language yields an empty script, which
    // selects the OS default below
    list($language) = explode('_', $locale);
    $script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';
    if (TYPO3_OS === 'WIN') {
        return !empty($this->script_to_charset_windows[$script])
            ? $this->script_to_charset_windows[$script]
            : 'windows-1252';
    }
    // A script mapped to an empty charset also falls back to the default
    return !empty($this->script_to_charset_unix[$script])
        ? $this->script_to_charset_unix[$script]
        : 'utf-8';
}
575 
576  /********************************************
577  *
578  * Charset Conversion functions
579  *
580  ********************************************/
/**
 * Converts a string from one charset to another.
 *
 * When the target is UTF-8, or no entity fallback is requested, the PHP
 * library configured in $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
 * (mbstring, iconv or recode) is tried first. If that is not configured or
 * returns FALSE, the conversion goes through utf8_encode()/utf8_decode().
 *
 * @param string $inputString String to convert
 * @param string $fromCharset Charset of the input
 * @param string $toCharset Charset to convert to
 * @param bool $useEntityForNoChar If set, utf8_decode() writes numeric entities for characters missing in the target charset
 * @return string Converted string
 */
public function conv($inputString, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    if ($fromCharset === $toCharset) {
        return $inputString;
    }
    // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
    $nativeLibraryUsable = $toCharset === 'utf-8' || !$useEntityForNoChar;
    if ($nativeLibraryUsable) {
        $nativeResult = false;
        switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) {
            case 'mbstring':
                // Returns FALSE for unsupported charsets
                $nativeResult = mb_convert_encoding($inputString, $toCharset, $fromCharset);
                break;
            case 'iconv':
                $nativeResult = iconv($fromCharset, $toCharset . '//TRANSLIT', $inputString);
                break;
            case 'recode':
                $nativeResult = recode_string($fromCharset . '..' . $toCharset, $inputString);
                break;
        }
        if ($nativeResult !== false) {
            return $nativeResult;
        }
    }
    // Table based conversion with UTF-8 as the intermediate representation
    $utf8String = $fromCharset === 'utf-8'
        ? $inputString
        : $this->utf8_encode($inputString, $fromCharset);

    return $toCharset === 'utf-8'
        ? $utf8String
        : $this->utf8_decode($utf8String, $toCharset, $useEntityForNoChar);
}
628 
/**
 * Converts all string values of an array in place, descending into nested
 * arrays. Values that are neither strings nor arrays are left untouched.
 *
 * @param array $array Array to convert, passed by reference
 * @param string $fromCharset Charset of the input
 * @param string $toCharset Charset to convert to
 * @param bool $useEntityForNoChar Passed on to conv()
 * @return void
 */
public function convArray(&$array, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    foreach (array_keys($array) as $index) {
        if (is_array($array[$index])) {
            $this->convArray($array[$index], $fromCharset, $toCharset, $useEntityForNoChar);
            continue;
        }
        if (is_string($array[$index])) {
            $array[$index] = $this->conv($array[$index], $fromCharset, $toCharset, $useEntityForNoChar);
        }
    }
}
650 
/**
 * Converts a string from a local charset to UTF-8 using the parsed
 * conversion table of that charset.
 *
 * Bytes up to 127 are copied unchanged (except in two-byte charsets, where
 * every pair of bytes is looked up). Characters missing from the table are
 * replaced by chr($this->noCharByteVal).
 *
 * @param string $str String in the local charset
 * @param string $charset Name of the local charset; must already be normalized (see parse_charset())
 * @return string|null UTF-8 string, or NULL if the conversion table could not be initialized
 */
public function utf8_encode($str, $charset)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already
    if (!$this->initCharset($charset)) {
        // No conversion table available for this charset
        return null;
    }
    $strLen = strlen($str);
    $outStr = '';
    // Loop-invariant lookups
    $isTwoByteSet = isset($this->twoByteSets[$charset]);
    $isEucBasedSet = isset($this->eucBasedSets[$charset]);
    $localToUtf8 = isset($this->parsedCharsets[$charset]['local'])
        ? $this->parsedCharsets[$charset]['local']
        : [];
    $noChar = chr($this->noCharByteVal);
    // Traverse each char in string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = substr($str, $a, 1);
        $ord = ord($chr);
        if ($isTwoByteSet) {
            // The charset has two bytes per char; assume big endian.
            // A dangling last byte of an odd-length string is paired with 0
            // instead of reading past the end of the string.
            $ord2 = $a + 1 < $strLen ? ord(substr($str, $a + 1, 1)) : 0;
            $ord = $ord << 8 | $ord2;
            // Use the table entry if the local char-number is known, otherwise the "no char" byte
            $outStr .= isset($localToUtf8[$ord]) ? $localToUtf8[$ord] : $noChar;
            // Skip the second byte of the pair
            $a++;
        } elseif ($ord > 127) {
            // Chars above 127 need a table lookup.
            // EUC uses two bytes above 127; we get both, advance the pointer and make $ord a 16bit int.
            // Shift-JIS: chars between 160 and 223 are single byte
            if ($isEucBasedSet && ($charset !== 'shift_jis' || $ord < 160 || $ord > 223)) {
                $a++;
                $ord2 = $a < $strLen ? ord(substr($str, $a, 1)) : 0;
                $ord = $ord * 256 + $ord2;
            }
            // Use the table entry if the local char-number is known, otherwise the "no char" byte
            $outStr .= isset($localToUtf8[$ord]) ? $localToUtf8[$ord] : $noChar;
        } else {
            $outStr .= $chr;
        }
    }
    return $outStr;
}
709 
/**
 * Converts a UTF-8 string to a local charset using the parsed conversion
 * table of that charset.
 *
 * Bytes up to 127 are copied unchanged. Multibyte characters are looked up
 * in the table; unknown characters become a numeric entity (if
 * $useEntityForNoChar is set) or chr($this->noCharByteVal). A byte above 127
 * that cannot start a sequence is replaced by chr($this->noCharByteVal).
 *
 * @param string $str UTF-8 string
 * @param string $charset Name of the target charset; must already be normalized (see parse_charset())
 * @param bool $useEntityForNoChar If set, characters missing in the target charset are written as "&#x...;"
 * @return string|null Converted string, or NULL if the conversion table could not be initialized
 */
public function utf8_decode($str, $charset, $useEntityForNoChar = false)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already
    if (!$this->initCharset($charset)) {
        return null;
    }
    $length = strlen($str);
    $result = '';
    for ($pos = 0; $pos < $length; $pos++) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // Plain ASCII
            $result .= $byte;
            continue;
        }
        if (($code & 64) === 0) {
            // A first byte must have the 7th bit set; otherwise we are in the middle of a byte sequence
            $result .= chr($this->noCharByteVal);
            continue;
        }
        // Collect the whole sequence: every further leading 1-bit of the
        // first byte announces one more byte
        $sequence = $byte;
        for ($n = 0; $n < 8; $n++) {
            $code = $code << 1;
            if (($code & 128) === 0) {
                break;
            }
            $pos++;
            $sequence .= substr($str, $pos, 1);
        }
        if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
            $localCode = $this->parsedCharsets[$charset]['utf8'][$sequence];
            // Local numbers above 255 are written as two bytes (16bit word assumed)
            $result .= $localCode > 255
                ? chr(($localCode >> 8) & 255) . chr($localCode & 255)
                : chr($localCode);
        } elseif ($useEntityForNoChar) {
            // Create num entity
            $result .= '&#' . $this->utf8CharToUnumber($sequence, true) . ';';
        } else {
            $result .= chr($this->noCharByteVal);
        }
    }
    return $result;
}
777 
/**
 * Replaces every multibyte character of a UTF-8 string by a hexadecimal
 * numeric entity ("&#x...;"). Bytes up to 127 are copied unchanged; a byte
 * above 127 that cannot start a sequence becomes chr($this->noCharByteVal).
 *
 * @param string $str UTF-8 string
 * @return string String in which multibyte characters are written as entities
 */
public function utf8_to_entities($str)
{
    $length = strlen($str);
    $result = '';
    $pos = 0;
    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // Plain ASCII
            $result .= $byte;
        } elseif (($code & 64) === 0) {
            // A first byte must have the 7th bit set; otherwise we are in the middle of a byte sequence
            $result .= chr($this->noCharByteVal);
        } else {
            // Every further leading 1-bit of the first byte announces one more byte
            $sequence = $byte;
            for ($n = 0; $n < 8; $n++) {
                $code = $code << 1;
                if (($code & 128) === 0) {
                    break;
                }
                $pos++;
                $sequence .= substr($str, $pos, 1);
            }
            $result .= '&#' . $this->utf8CharToUnumber($sequence, true) . ';';
        }
        $pos++;
    }
    return $result;
}
821 
/**
 * Converts numeric character references ("&#233;", "&#xE9;") to UTF-8
 * characters and, optionally, named HTML entities as well.
 *
 * Anything that is not a well-formed numeric reference or (with
 * $alsoStdHtmlEnt) a known named entity is left in the string unchanged.
 *
 * @param string $str Input string
 * @param bool $alsoStdHtmlEnt If set, named HTML entities (like "&amp;" or "&pound;") are converted too
 * @return string String with entities replaced by UTF-8 characters
 */
public function entities_to_utf8($str, $alsoStdHtmlEnt = false)
{
    $trans_tbl = $alsoStdHtmlEnt
        ? array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'))
        : [];
    $converted = preg_replace_callback(
        '/&([#[:alnum:]]*);/',
        function (array $match) use ($trans_tbl) {
            $entity = $match[1];
            if (substr($entity, 0, 1) !== '#') {
                // Named entity: convert only if it is in the translation table
                return isset($trans_tbl[$match[0]]) ? $trans_tbl[$match[0]] : $match[0];
            }
            $number = (string)substr($entity, 1);
            $firstChar = substr($number, 0, 1);
            if ($firstChar === 'x' || $firstChar === 'X') {
                // Hexadecimal reference
                $digits = (string)substr($number, 1);
                if (!preg_match('/^[0-9a-fA-F]+$/', $digits)) {
                    return $match[0];
                }
                return $this->UnumberToChar(hexdec($digits));
            }
            // Decimal reference; malformed ones (e.g. "&#abc;") are not
            // converted, they would otherwise end up as a bogus character
            if (!preg_match('/^[0-9]+$/', $number)) {
                return $match[0];
            }
            return $this->UnumberToChar((int)$number);
        },
        $str
    );
    // preg_replace_callback() returns NULL on a PCRE error; keep the input then
    return $converted === null ? $str : $converted;
}
862 
/**
 * Splits a UTF-8 string into an array with one entry per character: either
 * the Unicode numbers or (with $retChar) the UTF-8 characters themselves.
 * A byte above 127 that cannot start a sequence is reported as
 * $this->noCharByteVal (resp. the corresponding character).
 *
 * @param string $str UTF-8 string
 * @param bool $convEntities If set, entities (numeric and named) are converted to UTF-8 first
 * @param bool $retChar If set, the characters are returned instead of their Unicode numbers
 * @return array List of Unicode numbers or UTF-8 characters
 */
public function utf8_to_numberarray($str, $convEntities = false, $retChar = false)
{
    // If entities must be registered as well...
    if ($convEntities) {
        $str = $this->entities_to_utf8($str, true);
    }
    $length = strlen($str);
    $items = [];
    $pos = 0;
    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // Plain ASCII
            $items[] = $retChar ? chr($code) : $code;
        } elseif (($code & 64) === 0) {
            // A first byte must have the 7th bit set; otherwise we are in the middle of a byte sequence
            $items[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
        } else {
            // Every further leading 1-bit of the first byte announces one more byte
            $sequence = $byte;
            for ($n = 0; $n < 8; $n++) {
                $code = $code << 1;
                if (($code & 128) === 0) {
                    break;
                }
                $pos++;
                $sequence .= substr($str, $pos, 1);
            }
            $items[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
        }
        $pos++;
    }
    return $items;
}
913 
/**
 * Converts a Unicode number to its UTF-8 byte sequence. Numbers up to 31
 * bits are encoded in one to six bytes; anything larger yields
 * chr($this->noCharByteVal).
 *
 * @param int $unicodeInteger Unicode number
 * @return string UTF-8 byte sequence of the character
 */
public function UnumberToChar($unicodeInteger)
{
    if ($unicodeInteger < 128) {
        return chr($unicodeInteger);
    }
    // [exclusive upper limit, marker bits of the first byte, number of continuation bytes]
    $layouts = [
        [2048, 192, 1],
        [65536, 224, 2],
        [2097152, 240, 3],
        [67108864, 248, 4],
        [2147483648, 252, 5],
    ];
    foreach ($layouts as $layout) {
        list($limit, $leadMarker, $continuationCount) = $layout;
        if ($unicodeInteger < $limit) {
            // The first byte carries the marker plus the highest payload bits ...
            $bytes = chr($leadMarker | ($unicodeInteger >> (6 * $continuationCount)));
            // ... every continuation byte carries the next six bits
            for ($shift = 6 * ($continuationCount - 1); $shift >= 0; $shift -= 6) {
                $bytes .= chr(128 | (($unicodeInteger >> $shift) & 63));
            }
            return $bytes;
        }
    }
    // Cannot express a 32-bit character in UTF-8
    return chr($this->noCharByteVal);
}
970 
/**
 * Converts a single UTF-8 character (byte sequence) to its Unicode number.
 *
 * The number of continuation bytes is taken from the leading 1-bits of the
 * first byte; continuation bytes missing from $str count as zero bits. A
 * first byte that does not start a multibyte sequence is returned as is,
 * and an empty string yields 0.
 *
 * @param string $str UTF-8 byte sequence of one character
 * @param bool $hex If set, the number is returned as "x" followed by lower-case hex digits
 * @return int|string Unicode number, or its "x..." hex notation if $hex is set
 */
public function utf8CharToUnumber($str, $hex = false)
{
    $str = (string)$str;
    // First char (an empty string is treated like a NUL byte instead of reading offset 0)
    $lead = $str === '' ? 0 : ord($str[0]);
    // This verifies that it IS a multi byte string
    if (($lead & 192) === 192) {
        // Count the continuation bytes announced by the first byte
        $continuationCount = 0;
        $probe = $lead;
        while ($continuationCount < 8) {
            $probe = $probe << 1;
            if (($probe & 128) === 0) {
                break;
            }
            $continuationCount++;
        }
        // The first byte holds 6 - $continuationCount payload bits; the
        // invalid lead bytes 0xFE/0xFF hold none
        $int = $continuationCount < 6
            ? $lead & ((1 << (6 - $continuationCount)) - 1)
            : 0;
        // Append the lower six bits of every continuation byte
        for ($i = 1; $i <= $continuationCount; $i++) {
            $continuation = isset($str[$i]) ? ord($str[$i]) : 0;
            $int = $int * 64 + ($continuation & 63);
        }
    } else {
        $int = $lead;
    }
    return $hex ? 'x' . dechex($int) : $int;
}
1005 
1006  /********************************************
1007  *
1008  * Init functions
1009  *
1010  ********************************************/
/**
 * Loads the conversion table of a charset into $this->parsedCharsets.
 *
 * The table is read from the serialized cache file in typo3temp/cs/ if one
 * exists and is readable; otherwise the ".tbl" source file is parsed and the
 * result is written to the cache. Only local codes above 127 are stored.
 *
 * @param string $charset Name of the charset; must already be normalized (see parse_charset())
 * @return int|bool 1 if the table was already loaded, 2 if it was loaded by this call, FALSE if no conversion table file exists for the charset
 */
public function initCharset($charset)
{
    // Only process if the charset is not yet loaded:
    if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
        return 1;
    }
    // Conversion table filename:
    $charsetConvTableFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/csconvtbl/' . $charset . '.tbl';
    // No conversion table found:
    if (!$charset || !GeneralUtility::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
        return false;
    }
    // Cache file for charsets:
    // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
    $table = null;
    if ($cacheFile && @is_file($cacheFile)) {
        // NOTE(review): unserialize() is only safe as long as the cache
        // directory cannot be written by untrusted parties
        $table = unserialize(GeneralUtility::getUrl($cacheFile));
    }
    // No cache, or an unreadable/corrupt one: parse the table and (re)write the cache
    if (!is_array($table)) {
        $table = ['local' => [], 'utf8' => []];
        // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
        $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
        $detectedType = '';
        // Parse conversion table into lines and traverse them:
        $lines = GeneralUtility::trimExplode(LF, GeneralUtility::getUrl($charsetConvTableFile), true);
        foreach ($lines as $value) {
            // Comment line or blanks are ignored.
            if (!trim($value) || $value[0] === '#') {
                continue;
            }
            // Detect type if not done yet: (Done on first real line)
            if (!$detectedType) {
                $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
            }
            if ($detectedType === 'ms-token') {
                $tokens = preg_split('/[=:]/', $value, 3);
                if (count($tokens) < 2) {
                    // Malformed line: no "=" or ":" separator
                    continue;
                }
                list($hexbyte, $utf8) = $tokens;
            } else {
                $regA = [];
                if (!preg_match($whitespacedPattern, $value, $regA)) {
                    // Malformed line: does not follow the "whitespaced" syntax
                    continue;
                }
                $hexbyte = $regA[1];
                $utf8 = 'U+' . $regA[2];
            }
            $decval = hexdec(trim($hexbyte));
            // Codes up to 127 are not stored; the converters copy them unchanged
            if ($decval > 127) {
                // Strip the "U+" prefix
                $utf8decval = hexdec(substr(trim($utf8), 2));
                $utf8Char = $this->UnumberToChar($utf8decval);
                $table['local'][$decval] = $utf8Char;
                $table['utf8'][$utf8Char] = $decval;
            }
        }
        if ($cacheFile) {
            GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($table));
        }
    }
    $this->parsedCharsets[$charset] = $table;
    return 2;
}
1077 
1087  public function initUnicodeData($mode = null)
1088  {
1089  // Cache files
1090  $cacheFileCase = GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
1091  $cacheFileASCII = GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
1092  // Only process if the tables are not yet loaded
1093  switch ($mode) {
1094  case 'case':
1095  if (is_array($this->caseFolding['utf-8'])) {
1096  return 1;
1097  }
1098  // Use cached version if possible
1099  if ($cacheFileCase && @is_file($cacheFileCase)) {
1100  $this->caseFolding['utf-8'] = unserialize(GeneralUtility::getUrl($cacheFileCase));
1101  return 2;
1102  }
1103  break;
1104  case 'ascii':
1105  if (is_array($this->toASCII['utf-8'])) {
1106  return 1;
1107  }
1108  // Use cached version if possible
1109  if ($cacheFileASCII && @is_file($cacheFileASCII)) {
1110  $this->toASCII['utf-8'] = unserialize(GeneralUtility::getUrl($cacheFileASCII));
1111  return 2;
1112  }
1113  break;
1114  }
1115  // Process main Unicode data file
1116  $unicodeDataFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/UnicodeData.txt';
1117  if (!(GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
1118  return false;
1119  }
1120  $fh = fopen($unicodeDataFile, 'rb');
1121  if (!$fh) {
1122  return false;
1123  }
1124  // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
1125  // Note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
1126  $this->caseFolding['utf-8'] = [];
1127  $utf8CaseFolding = &$this->caseFolding['utf-8'];
1128  // a shorthand
1129  $utf8CaseFolding['toUpper'] = [];
1130  $utf8CaseFolding['toLower'] = [];
1131  $utf8CaseFolding['toTitle'] = [];
1132  // Array of temp. decompositions
1133  $decomposition = [];
1134  // Array of chars that are marks (eg. composing accents)
1135  $mark = [];
1136  // Array of chars that are numbers (eg. digits)
1137  $number = [];
1138  // Array of chars to be omitted (eg. Russian hard sign)
1139  $omit = [];
1140  while (!feof($fh)) {
1141  $line = fgets($fh, 4096);
1142  // Has a lot of info
1143  list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title, ) = explode(';', rtrim($line));
1144  $ord = hexdec($char);
1145  if ($ord > 65535) {
1146  // Only process the BMP
1147  break;
1148  }
1149  $utf8_char = $this->UnumberToChar($ord);
1150  if ($upper) {
1151  $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
1152  }
1153  if ($lower) {
1154  $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
1155  }
1156  // Store "title" only when different from "upper" (only a few)
1157  if ($title && $title !== $upper) {
1158  $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
1159  }
1160  switch ($cat[0]) {
1161  case 'M':
1162  // mark (accent, umlaut, ...)
1163  $mark['U+' . $char] = 1;
1164  break;
1165  case 'N':
1166  // numeric value
1167  if ($ord > 128 && $num !== '') {
1168  $number['U+' . $char] = $num;
1169  }
1170  }
1171  // Accented Latin letters without "official" decomposition
1172  $match = [];
1173  if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
1174  $c = ord($match[2]);
1175  if ($match[1] === 'SMALL') {
1176  $c += 32;
1177  }
1178  $decomposition['U+' . $char] = [dechex($c)];
1179  continue;
1180  }
1181  $match = [];
1182  if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
1183  switch ($match[1]) {
1184  case '<circle>':
1185  // add parenthesis as circle replacement, eg (1)
1186  $match[2] = '0028 ' . $match[2] . ' 0029';
1187  break;
1188  case '<square>':
1189  // add square brackets as square replacement, eg [1]
1190  $match[2] = '005B ' . $match[2] . ' 005D';
1191  break;
1192  case '<compat>':
1193  // ignore multi char decompositions that start with a space
1194  if (preg_match('/^0020 /', $match[2])) {
1195  continue 2;
1196  }
1197  break;
1198  case '<initial>':
1199  case '<medial>':
1200  case '<final>':
1201  case '<isolated>':
1202  case '<vertical>':
1203  continue 2;
1204  }
1205  $decomposition['U+' . $char] = explode(' ', $match[2]);
1206  }
1207  }
1208  fclose($fh);
1209  // Process additional Unicode data for casing (allow folded characters to expand into a sequence)
1210  $specialCasingFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/SpecialCasing.txt';
1211  if (GeneralUtility::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
1212  $fh = fopen($specialCasingFile, 'rb');
1213  if ($fh) {
1214  while (!feof($fh)) {
1215  $line = fgets($fh, 4096);
1216  if ($line[0] !== '#' && trim($line) !== '') {
1217  list($char, $lower, $title, $upper, $cond) = GeneralUtility::trimExplode(';', $line);
1218  if ($cond === '' || $cond[0] === '#') {
1219  $utf8_char = $this->UnumberToChar(hexdec($char));
1220  if ($char !== $lower) {
1221  $arr = explode(' ', $lower);
1222  for ($i = 0; isset($arr[$i]); $i++) {
1223  $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1224  }
1225  $utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
1226  }
1227  if ($char !== $title && $title !== $upper) {
1228  $arr = explode(' ', $title);
1229  for ($i = 0; isset($arr[$i]); $i++) {
1230  $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1231  }
1232  $utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
1233  }
1234  if ($char !== $upper) {
1235  $arr = explode(' ', $upper);
1236  for ($i = 0; isset($arr[$i]); $i++) {
1237  $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1238  }
1239  $utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
1240  }
1241  }
1242  }
1243  }
1244  fclose($fh);
1245  }
1246  }
1247  // Process custom decompositions
1248  $customTranslitFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/Translit.txt';
1249  if (GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
1250  $fh = fopen($customTranslitFile, 'rb');
1251  if ($fh) {
1252  while (!feof($fh)) {
1253  $line = fgets($fh, 4096);
1254  if ($line[0] !== '#' && trim($line) !== '') {
1255  list($char, $translit) = GeneralUtility::trimExplode(';', $line);
1256  if (!$translit) {
1257  $omit['U+' . $char] = 1;
1258  }
1259  $decomposition['U+' . $char] = explode(' ', $translit);
1260  }
1261  }
1262  fclose($fh);
1263  }
1264  }
1265  // Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
1266  foreach ($decomposition as $from => $to) {
1267  $code_decomp = [];
1268  while ($code_value = array_shift($to)) {
1269  // Do recursive decomposition
1270  if (isset($decomposition['U+' . $code_value])) {
1271  foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
1272  array_unshift($to, $cv);
1273  }
1274  } elseif (!isset($mark['U+' . $code_value])) {
1275  // remove mark
1276  array_push($code_decomp, $code_value);
1277  }
1278  }
1279  if (!empty($code_decomp) || isset($omit[$from])) {
1280  $decomposition[$from] = $code_decomp;
1281  } else {
1282  unset($decomposition[$from]);
1283  }
1284  }
1285  // Create ascii only mapping
1286  $this->toASCII['utf-8'] = [];
1287  $ascii = &$this->toASCII['utf-8'];
1288  foreach ($decomposition as $from => $to) {
1289  $code_decomp = [];
1290  while ($code_value = array_shift($to)) {
1291  $ord = hexdec($code_value);
1292  if ($ord > 127) {
1293  continue 2;
1294  } else {
1295  // Skip decompositions containing non-ASCII chars
1296  array_push($code_decomp, chr($ord));
1297  }
1298  }
1299  $ascii[$this->UnumberToChar(hexdec($from))] = implode('', $code_decomp);
1300  }
1301  // Add numeric decompositions
1302  foreach ($number as $from => $to) {
1303  $utf8_char = $this->UnumberToChar(hexdec($from));
1304  if (!isset($ascii[$utf8_char])) {
1305  $ascii[$utf8_char] = $to;
1306  }
1307  }
1308  if ($cacheFileCase) {
1309  GeneralUtility::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
1310  }
1311  if ($cacheFileASCII) {
1312  GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
1313  }
1314  return 3;
1315  }
1316 
/**
 * Loads (or builds) the case folding table for a charset into $this->caseFolding.
 *
 * Lookup order: table already in memory, then the cache file
 * typo3temp/cs/cscase_<charset>.tbl, otherwise the table is derived from the
 * UTF-8 case folding data and written to that cache file.
 *
 * @param string $charset Charset to initialize the case folding table for
 * @return int|bool 1 if the table was already loaded, 2 if it was read from the cache file, 3 if it was built, FALSE if the charset or the Unicode data could not be initialized
 */
public function initCaseFolding($charset)
{
    // Only process if the case table is not yet loaded.
    // isset() first: the charset key does not exist on the first call.
    if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
        return 1;
    }
    // Use cached version if possible
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $this->caseFolding[$charset] = unserialize(GeneralUtility::getUrl($cacheFile));
        return 2;
    }
    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8 case folding is used as the base conversion table
    if (!$this->initUnicodeData('case')) {
        return false;
    }
    $nochar = chr($this->noCharByteVal);
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        foreach (['toUpper', 'toLower', 'toTitle'] as $direction) {
            // Many characters have no folding entry; fall back to '' instead of
            // reading a missing array key.
            $folded = isset($this->caseFolding['utf-8'][$direction][$utf8])
                ? $this->caseFolding['utf-8'][$direction][$utf8]
                : '';
            $cc = $this->utf8_decode($folded, $charset);
            if ($cc !== '' && $cc !== $nochar) {
                $this->caseFolding[$charset][$direction][$c] = $cc;
            }
        }
    }
    // Add the ASCII case table ('a'-'z' and 'A'-'Z' are 32 code points apart)
    $start = ord('a');
    $end = ord('z');
    for ($i = $start; $i <= $end; $i++) {
        $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
    }
    $start = ord('A');
    $end = ord('Z');
    for ($i = $start; $i <= $end; $i++) {
        $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
    }
    return 3;
}
1378 
/**
 * Loads (or builds) the to-ASCII transliteration table for a charset into $this->toASCII.
 *
 * Lookup order: table already in memory, then the cache file
 * typo3temp/cs/csascii_<charset>.tbl, otherwise the table is derived from the
 * UTF-8 transliteration data and written to that cache file.
 *
 * @param string $charset Charset to initialize the transliteration table for
 * @return int|bool 1 if the table was already loaded, 2 if it was read from the cache file, 3 if it was built, FALSE if the charset or the Unicode data could not be initialized
 */
public function initToASCII($charset)
{
    // Only process if the table is not yet loaded.
    // isset() first: the charset key does not exist on the first call.
    if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
        return 1;
    }
    // Use cached version if possible
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $this->toASCII[$charset] = unserialize(GeneralUtility::getUrl($cacheFile));
        return 2;
    }
    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8/ASCII transliteration is used as the base conversion table
    if (!$this->initUnicodeData('ascii')) {
        return false;
    }
    // Start from an empty table so that a charset without any transliterable
    // character still yields an array (and not an undefined entry) below.
    $this->toASCII[$charset] = [];
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        if (isset($this->toASCII['utf-8'][$utf8])) {
            $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
        }
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
    }
    return 3;
}
1419 
1420  /********************************************
1421  *
1422  * String operation functions
1423  *
1424  ********************************************/
/**
 * Returns a part of a string, counted in characters of the given charset.
 *
 * Dispatches to mbstring or iconv when configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'], otherwise to the
 * internal UTF-8 / EUC / fixed-width / single-byte implementations.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $start Start position (character position)
 * @param int|null $len Length (in characters); NULL means "up to the end of the string"
 * @return string|bool The substring; the underlying implementations may return FALSE for out-of-range positions
 */
public function substr($charset, $string, $start, $len = null)
{
    if ($len === 0 || $string === '') {
        return '';
    }
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
        // Cannot omit $len, when specifying charset
        if ($len === null) {
            // Save internal encoding
            $enc = mb_internal_encoding();
            mb_internal_encoding($charset);
            $str = mb_substr($string, $start);
            // Restore internal encoding
            mb_internal_encoding($enc);
            return $str;
        }
        return mb_substr($string, $start, $len, $charset);
    }
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'iconv') {
        // Cannot omit $len, when specifying charset
        if ($len === null) {
            // Save internal encoding
            $enc = iconv_get_encoding('internal_encoding');
            iconv_set_encoding('internal_encoding', $charset);
            $str = iconv_substr($string, $start);
            // Restore internal encoding
            iconv_set_encoding('internal_encoding', $enc);
            return $str;
        }
        return iconv_substr($string, $start, $len, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_substr($string, $start, $len);
    }
    // !empty() instead of a plain read: most charsets are not a key of these sets
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_substr($string, $start, $charset, $len);
    }
    if (!empty($this->twoByteSets[$charset])) {
        // A NULL length must not be multiplied: NULL * 2 is 0, which would cut everything
        return $len === null ? substr($string, $start * 2) : substr($string, $start * 2, $len * 2);
    }
    if (!empty($this->fourByteSets[$charset])) {
        return $len === null ? substr($string, $start * 4) : substr($string, $start * 4, $len * 4);
    }
    // Treat everything else as single-byte encoding
    return $len === null ? substr($string, $start) : substr($string, $start, $len);
}
1479 
/**
 * Counts the number of characters of a string in the given charset.
 *
 * Dispatches to mbstring or iconv when configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'], otherwise to the
 * internal UTF-8 / EUC / fixed-width / single-byte implementations.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return int The number of characters
 */
public function strlen($charset, $string)
{
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
        return mb_strlen($string, $charset);
    }
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'iconv') {
        return iconv_strlen($string, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_strlen($string);
    }
    // !empty() instead of a plain read: most charsets are not a key of these sets
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_strlen($string, $charset);
    }
    if (!empty($this->twoByteSets[$charset])) {
        // Cast so that a trailing incomplete unit does not turn the result into a float
        return (int)(strlen($string) / 2);
    }
    if (!empty($this->fourByteSets[$charset])) {
        return (int)(strlen($string) / 4);
    }
    // Treat everything else as single-byte encoding
    return strlen($string);
}
1507 
/**
 * Crops a string to a number of characters using the mbstring functions.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len Number of characters to keep; positive keeps the start, negative keeps the end
 * @param string $crop Marker appended (positive $len) or prepended (negative $len) when the string was cropped
 * @return string The cropped string, or the unchanged string if it is not longer than abs($len) characters
 */
protected function cropMbstring($charset, $string, $len, $crop = '')
{
    if ((int)$len === 0) {
        return $string;
    }
    $totalChars = mb_strlen($string, $charset);
    if ($totalChars <= abs($len)) {
        return $string;
    }
    if ($len > 0) {
        // Keep the first $len characters
        return mb_substr($string, 0, $len, $charset) . $crop;
    }
    // Keep the last abs($len) characters
    return $crop . mb_substr($string, $len, $totalChars, $charset);
}
1530 
/**
 * Crops a string to a number of characters of the given charset.
 *
 * Uses cropMbstring() when mbstring is configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']; otherwise the cut
 * position is computed with the internal UTF-8 / EUC helpers, and every other
 * charset is treated as single-byte.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len Number of characters to keep; positive keeps the start, negative keeps the end
 * @param string $crop Marker appended (positive $len) or prepended (negative $len) when the string was cropped
 * @return string The cropped string, or the unchanged string if nothing had to be cut
 */
public function crop($charset, $string, $len, $crop = '')
{
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
        return $this->cropMbstring($charset, $string, $len, $crop);
    }
    if ((int)$len === 0) {
        return $string;
    }
    if ($charset === 'utf-8') {
        $i = $this->utf8_char2byte_pos($string, $len);
    } elseif (!empty($this->eucBasedSets[$charset])) {
        // !empty() instead of a plain read: most charsets are not a key of this set
        $i = $this->euc_char2byte_pos($string, $len, $charset);
    } elseif ($len > 0) {
        $i = $len;
    } else {
        $i = strlen($string) + $len;
        if ($i <= 0) {
            $i = false;
        }
    }
    // $len outside actual string length
    if ($i === false) {
        return $string;
    }
    if ($len > 0) {
        // Only crop if there is at least one byte after the cut position
        if (isset($string[$i])) {
            return substr($string, 0, $i) . $crop;
        }
    } elseif ($i > 0 && isset($string[$i - 1])) {
        // Only crop if there is at least one byte before the cut position.
        // The explicit $i > 0 keeps offset -1 (the last byte on PHP >= 7.1)
        // from being mistaken for "something to cut".
        return $crop . substr($string, $i);
    }
    return $string;
}
1580 
/**
 * Truncates a string to at most $len bytes without splitting a character.
 *
 * Uses mb_strcut() when mbstring is configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'], otherwise the internal
 * UTF-8 / EUC helpers; two- and four-byte charsets are aligned to their unit
 * size and everything else is treated as single-byte.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len Maximum length (in bytes)
 * @return string The shortened string; '' if $len is zero or negative
 */
public function strtrunc($charset, $string, $len)
{
    if ($len <= 0) {
        return '';
    }
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
        return mb_strcut($string, 0, $len, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_strtrunc($string, $len);
    }
    // !empty() instead of a plain read: most charsets are not a key of these sets
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_strtrunc($string, $len, $charset);
    }
    if (!empty($this->twoByteSets[$charset])) {
        // Realign to a position divisible by two
        $len -= $len % 2;
    } elseif (!empty($this->fourByteSets[$charset])) {
        // Realign to a position divisible by four
        $len -= $len % 4;
    }
    // Treat everything else as single-byte encoding
    return substr($string, 0, $len);
}
1613 
/**
 * Converts the case of a string in the given charset.
 *
 * With mbstring configured, 'toLower' lowercases and any other $case value
 * uppercases. Otherwise the string is mapped through the case folding table
 * of the charset, using $case as the table key.
 *
 * @param string $charset The character set
 * @param string $string Input string
 * @param string $case Case keyword, e.g. 'toLower' or 'toUpper'
 * @return string The converted string
 */
public function conv_case($charset, $string, $case)
{
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
        return $case === 'toLower'
            ? mb_strtolower($string, $charset)
            : mb_strtoupper($string, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string, 'case', $case);
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'case', $case);
    }
    // Treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1646 
/**
 * Converts the case of the first character of a string and leaves the rest untouched.
 *
 * @param string $charset The character set
 * @param string $string Input string
 * @param string $case Case keyword passed on to conv_case()
 * @return string The string with its first character converted
 */
public function convCaseFirst($charset, $string, $case)
{
    $head = $this->conv_case($charset, $this->substr($charset, $string, 0, 1), $case);
    $tail = $this->substr($charset, $string, 1);
    return $head . $tail;
}
1663 
/**
 * Capitalizes the first letter of every word of a string.
 *
 * Uses mb_convert_case() with MB_CASE_TITLE when mbstring is configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']; otherwise falls back
 * to ucwords(), which does not take $charset into account.
 *
 * @param string $charset The character set
 * @param string $string Input string
 * @return string The capitalized string
 */
public function convCapitalize($charset, $string)
{
    $useMbstring = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring';
    return $useMbstring
        ? mb_convert_case($string, MB_CASE_TITLE, $charset)
        : ucwords($string);
}
1679 
/**
 * Maps the characters of a string through the to-ASCII table of its charset.
 *
 * @param string $charset The character set
 * @param string $string Input string
 * @return string The string after mapping through the 'ascii' table
 */
public function specCharsToASCII($charset, $string)
{
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string, 'ascii');
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'ascii');
    }
    // Treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'ascii');
}
1699 
/**
 * Picks the best matching TYPO3 language key for a list of preferred language codes.
 *
 * The list is comma separated; every entry may carry a ';q=' quality suffix
 * (entries without one count as 1.0). Entries are tried from highest to lowest
 * quality, first as given and then with everything after the first '-' removed.
 *
 * @param string $languageCodesList Comma separated list of language codes, optionally with ';q=' qualities
 * @return string The matching TYPO3 language key, or 'default' if nothing matches or the match is 'en'
 */
public function getPreferredClientLanguage($languageCodesList)
{
    $knownCodes = $this->getAllLanguageCodes();
    // Collect "code => quality" pairs
    $qualityByCode = [];
    foreach (GeneralUtility::trimExplode(',', $languageCodesList) as $entry) {
        $code = $entry;
        $quality = 1.0;
        if (strpos($entry, ';q=') !== false) {
            $parts = explode(';q=', $entry);
            $code = $parts[0];
            $quality = $parts[1];
        }
        $qualityByCode[$code] = $quality;
    }
    // Highest quality first
    arsort($qualityByCode, SORT_NUMERIC);
    $selectedLanguage = 'default';
    foreach ($qualityByCode as $code => $quality) {
        if (isset($knownCodes[$code])) {
            $selectedLanguage = $knownCodes[$code];
            break;
        }
        // Retry without the country part
        $codeParts = explode('-', $code);
        $baseCode = $codeParts[0];
        if (isset($knownCodes[$baseCode])) {
            $selectedLanguage = $knownCodes[$baseCode];
            break;
        }
    }
    if (!$selectedLanguage || $selectedLanguage === 'en') {
        return 'default';
    }
    return $selectedLanguage;
}
1740 
/**
 * Builds the lookup table "language code => TYPO3 language key".
 *
 * Starts with the keys of $this->charSetArray mapped to themselves, overrides
 * them with the ISO mapping from Locales ('_' replaced by '-'), flips the
 * result so the codes become the array keys, and adds 'en' => 'default'.
 *
 * @return array Language codes as keys, TYPO3 language keys as values
 */
protected function getAllLanguageCodes()
{
    // Languages whose TYPO3 key is identical to the ISO code
    $typo3Keys = array_keys($this->charSetArray);
    $codeByTypo3Key = array_combine($typo3Keys, $typo3Keys);
    // Languages whose TYPO3 key differs from the ISO code or needs the country
    // part; the ISO code replaces the default value stored for that key
    /** @var Locales $locales */
    $locales = GeneralUtility::makeInstance(Locales::class);
    foreach ($locales->getIsoMapping() as $typo3Lang => $isoLang) {
        $codeByTypo3Key[$typo3Lang] = str_replace('_', '-', $isoLang);
    }
    // Move the codes to the keys, because callers look them up with isset()
    $typo3KeyByCode = array_flip($codeByTypo3Key);
    // The default language (English) has to be added explicitly
    $typo3KeyByCode['en'] = 'default';
    return $typo3KeyByCode;
}
1766 
1767  /********************************************
1768  *
1769  * Internal string operation functions
1770  *
1771  ********************************************/
/**
 * Maps every byte of a single-byte encoded string through a conversion table.
 *
 * @param string $str Input string
 * @param string $charset The character set
 * @param string $mode Table to use: 'case' (case folding) or 'ascii' (transliteration)
 * @param string $opt For mode 'case': key of the case folding table to apply
 * @return string The mapped string; the unchanged input if the table cannot be initialized or $mode is unknown
 */
public function sb_char_mapping($str, $charset, $mode, $opt = '')
{
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                // No table available: leave the string as it is
                return $str;
            }
            $map = &$this->caseFolding[$charset][$opt];
            break;
        case 'ascii':
            if (!$this->initToASCII($charset)) {
                // No table available: leave the string as it is
                return $str;
            }
            $map = &$this->toASCII[$charset];
            break;
        default:
            return $str;
    }
    $out = '';
    $i = 0;
    while (isset($str[$i])) {
        $byte = $str[$i];
        // Bytes without a table entry are copied unchanged
        $out .= isset($map[$byte]) ? $map[$byte] : $byte;
        $i++;
    }
    return $out;
}
1812 
1813  /********************************************
1814  *
1815  * Internal UTF-8 string operation functions
1816  *
1817  ********************************************/
/**
 * Returns a part of a UTF-8 string, counted in characters.
 *
 * @param string $str UTF-8 string
 * @param int $start Start position (character position); may be negative
 * @param int|null $len Length (in characters); may be negative, NULL means "up to the end"
 * @return string|bool The substring, or FALSE if a positive $start lies outside the string
 */
public function utf8_substr($str, $start, $len = null)
{
    if ((string)$len === '0') {
        return '';
    }
    $byteStart = $this->utf8_char2byte_pos($str, $start);
    if ($byteStart === false && $start > 0) {
        // $start outside string length
        return false;
    }
    // A FALSE $byteStart (start at or before the beginning) acts as offset 0 here
    $tail = substr($str, $byteStart);
    if ($len == null) {
        return $tail;
    }
    $byteEnd = $this->utf8_char2byte_pos($tail, $len);
    if ($byteEnd !== false) {
        return substr($tail, 0, $byteEnd);
    }
    // $len outside actual string length: a negative length that exceeds the
    // string yields a blank string, a positive one yields everything
    return $len < 0 ? '' : $tail;
}
1854 
/**
 * Counts the characters of a UTF-8 string.
 *
 * Every byte that is not a continuation byte (10xxxxxx) counts as one character.
 *
 * @param string $str UTF-8 string
 * @return int Number of characters
 */
public function utf8_strlen($str)
{
    $count = 0;
    $offset = 0;
    while (isset($str[$offset])) {
        // 0xxxxxxx (single byte) and 11xxxxxx (lead byte) both start a character
        if ((ord($str[$offset]) & 192) !== 128) {
            $count++;
        }
        $offset++;
    }
    return $count;
}
1878 
/**
 * Truncates a UTF-8 string to at most $len bytes without splitting a multi-byte character.
 *
 * @param string $str UTF-8 string
 * @param int $len Maximum length (in bytes)
 * @return string The shortened string; '' if $len is zero or negative
 */
public function utf8_strtrunc($str, $len)
{
    if ($len <= 0) {
        return '';
    }
    if ($len > strlen($str)) {
        // Nothing to cut; also avoids reading a byte past the end of the string
        return $str;
    }
    $i = $len - 1;
    if (!(ord($str[$i]) & 128)) {
        // The last kept byte is ASCII, so the cut cannot split a character
        return substr($str, 0, $len);
    }
    // Walk back over continuation bytes (10xxxxxx) to the lead byte of the sequence
    while ($i > 0 && (ord($str[$i]) & 192) === 128) {
        $i--;
    }
    $lead = ord($str[$i]);
    if (($lead & 192) !== 192) {
        // Malformed input: no lead byte in front of the continuation bytes.
        // Drop everything if only continuation bytes were found, otherwise cut at $len.
        return ($lead & 128) ? '' : substr($str, 0, $len);
    }
    // The number of leading 1-bits of the lead byte is the length of the sequence
    $sequenceLength = 0;
    for ($bits = $lead; $bits & 128; $bits = $bits << 1) {
        $sequenceLength++;
    }
    if ($i + $sequenceLength > $len) {
        // The sequence does not fit completely: cut in front of it
        return substr($str, 0, $i);
    }
    return substr($str, 0, $len);
}
1908 
/**
 * Finds the character position of the first occurrence of a string in a UTF-8 string.
 *
 * Delegates to mb_strpos() or iconv_strpos() when configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']; otherwise searches
 * byte-wise and converts between byte and character positions.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param int $offset Character position to start the search at
 * @return int|bool The character position, or FALSE if the offset is outside the string or the needle was not found
 */
public function utf8_strpos($haystack, $needle, $offset = 0)
{
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
        return mb_strpos($haystack, $needle, $offset, 'utf-8');
    }
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'iconv') {
        return iconv_strpos($haystack, $needle, $offset, 'utf-8');
    }
    $startByte = $this->utf8_char2byte_pos($haystack, $offset);
    if ($startByte === false) {
        // Offset beyond string length
        return false;
    }
    $foundByte = strpos($haystack, $needle, $startByte);
    // FALSE means the needle was not found
    return $foundByte === false ? false : $this->utf8_byte2char_pos($haystack, $foundByte);
}
1937 
/**
 * Finds the character position of the last occurrence of a string in a UTF-8 string.
 *
 * Delegates to mb_strrpos() or iconv_strrpos() when configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']; otherwise searches
 * byte-wise and converts the byte position to a character position.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @return int|bool The character position, or FALSE if the needle was not found
 */
public function utf8_strrpos($haystack, $needle)
{
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
        return mb_strrpos($haystack, $needle, 'utf-8');
    }
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'iconv') {
        return iconv_strrpos($haystack, $needle, 'utf-8');
    }
    $foundByte = strrpos($haystack, $needle);
    // FALSE means the needle was not found
    return $foundByte === false ? false : $this->utf8_byte2char_pos($haystack, $foundByte);
}
1960 
/**
 * Translates a character position into a byte position of a UTF-8 string.
 *
 * A negative $pos counts characters from the end of the string.
 *
 * @param string $str UTF-8 string
 * @param int $pos Character position (negative values start from the end)
 * @return int|bool Byte position, or FALSE if the walk leaves the string before a valid position is reached
 */
public function utf8_char2byte_pos($str, $pos)
{
    // Explicit bounds are used instead of isset($str[$i]): negative string
    // offsets are valid on PHP >= 7.1 and would let the walk wrap around.
    $length = strlen($str);
    // Number of characters found
    $n = 0;
    // Number of characters wanted
    $p = abs($pos);
    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = $length - 1;
        $d = -1;
    }
    for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
        // Single-byte (0xxxxxxx) and lead bytes (11xxxxxx) start a character;
        // continuation bytes (10xxxxxx) do not
        if ((ord($str[$i]) & 192) !== 128) {
            $n++;
        }
    }
    if ($i < 0 || $i >= $length) {
        // Offset beyond string length
        return false;
    }
    if ($pos >= 0) {
        // Skip trailing multi-byte data bytes, without reading past the end
        while ($i < $length && (ord($str[$i]) & 192) === 128) {
            $i++;
        }
    } else {
        // Correct offset: the loop stopped one byte in front of the character
        $i++;
    }
    return $i;
}
2007 
/**
 * Translates a byte position into a character position of a UTF-8 string.
 *
 * @param string $str UTF-8 string
 * @param int $pos Byte position
 * @return int|bool Character position, or FALSE if $pos is not a valid byte offset of $str
 */
public function utf8_byte2char_pos($str, $pos)
{
    // Validate the requested offset itself, up front: after the counting loop
    // the loop variable is always 0 and says nothing about $pos.
    if ($pos < 0 || $pos >= strlen($str)) {
        // Offset beyond string length
        return false;
    }
    // Number of characters
    $n = 0;
    for ($i = $pos; $i > 0; $i--) {
        // Single-byte (0xxxxxxx) and lead bytes (11xxxxxx) start a character;
        // continuation bytes (10xxxxxx) do not
        if ((ord($str[$i]) & 192) !== 128) {
            $n++;
        }
    }
    return $n;
}
2036 
/**
 * Maps every character of a UTF-8 string through a conversion table.
 *
 * @param string $str UTF-8 string
 * @param string $mode Table to use: 'case' (case folding) or 'ascii' (transliteration)
 * @param string $opt For mode 'case': key of the case folding table to apply
 * @return string The mapped string; the unchanged input if the Unicode data cannot be initialized or $mode is unknown
 */
public function utf8_char_mapping($str, $mode, $opt = '')
{
    if (!$this->initUnicodeData($mode)) {
        // No table available: leave the string as it is
        return $str;
    }
    switch ($mode) {
        case 'case':
            $map = &$this->caseFolding['utf-8'][$opt];
            break;
        case 'ascii':
            $map = &$this->toASCII['utf-8'];
            break;
        default:
            return $str;
    }
    $out = '';
    for ($i = 0; isset($str[$i]); $i++) {
        $c = ord($str[$i]);
        if (($c & 192) === 192) {
            // Multi-byte lead byte (11xxxxxx): its leading 1-bits give the
            // number of bytes of the sequence
            for ($bc = 0; $c & 128; $c = $c << 1) {
                $bc++;
            }
            $mbc = substr($str, $i, $bc);
            $i += $bc - 1;
        } else {
            // Single-byte character (0xxxxxxx), or a stray continuation byte in
            // malformed input: handle the byte on its own instead of re-using
            // the previously processed character
            $mbc = $str[$i];
        }
        // Characters without a table entry are copied unchanged
        $out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
    }
    return $out;
}
2084 
2085  /********************************************
2086  *
2087  * Internal EUC string operation functions
2088  *
2089  * Extended Unix Code:
2090  * ASCII compatible 7bit single bytes chars
2091  * 8bit two byte chars
2092  *
2093  * Shift-JIS is treated as a special case.
2094  *
2095  ********************************************/
/**
 * Truncates an EUC (or Shift-JIS) string to at most $len bytes without splitting a double-byte character.
 *
 * @param string $str EUC multibyte character string
 * @param int $len Maximum length (in bytes)
 * @param string $charset The charset; 'shift_jis' selects the Shift-JIS lead byte ranges
 * @return string The shortened string; '' if $len is zero or negative
 */
public function euc_strtrunc($str, $len, $charset)
{
    if ($len <= 0) {
        return '';
    }
    if (strlen($str) <= $len) {
        // String not longer than supplied length: nothing to cut
        return $str;
    }
    $shiftJis = $charset === 'shift_jis';
    // Walk character by character until the cut position is reached or passed
    for ($i = 0; isset($str[$i]) && $i < $len; $i++) {
        $c = ord($str[$i]);
        $isLeadByte = $shiftJis ? ($c >= 128 && $c < 160 || $c >= 224) : $c >= 128;
        if ($isLeadByte) {
            // Double-byte character: skip its second byte as well
            $i++;
        }
    }
    if ($i > $len) {
        // The last character straddles the cut position: drop its first byte too
        return substr($str, 0, $len - 1);
    }
    return substr($str, 0, $len);
}
2131 
/**
 * Returns a part of an EUC (or Shift-JIS) string, counted in characters.
 *
 * Handles zero and out-of-range lengths and start positions the same way as
 * utf8_substr() does.
 *
 * @param string $str EUC multibyte character string
 * @param int $start Start position (character position); may be negative
 * @param string $charset The charset
 * @param int|null $len Length (in characters); may be negative, NULL means "up to the end"
 * @return string|bool The substring, or FALSE if a positive $start lies outside the string
 */
public function euc_substr($str, $start, $charset, $len = null)
{
    if ((string)$len === '0') {
        // A zero length is not NULL: it asks for an empty result
        return '';
    }
    $byte_start = $this->euc_char2byte_pos($str, $start, $charset);
    if ($byte_start === false && $start > 0) {
        // $start outside string length
        return false;
    }
    // A FALSE $byte_start (start at or before the beginning) acts as offset 0 here
    $str = substr($str, $byte_start);
    if ($len == null) {
        return $str;
    }
    $byte_end = $this->euc_char2byte_pos($str, $len, $charset);
    if ($byte_end === false) {
        // $len outside actual string length: a negative length that exceeds the
        // string yields a blank string, a positive one yields everything
        return $len < 0 ? '' : $str;
    }
    return substr($str, 0, $byte_end);
}
2161 
/**
 * Counts the characters of an EUC (or Shift-JIS) string.
 *
 * A byte that qualifies as a lead byte counts as one character together with
 * the byte following it; every other byte counts as one character.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset; 'shift_jis' selects the Shift-JIS lead byte ranges
 * @return int Number of characters
 */
public function euc_strlen($str, $charset)
{
    $isShiftJis = $charset === 'shift_jis';
    $count = 0;
    $offset = 0;
    while (isset($str[$offset])) {
        $byte = ord($str[$offset]);
        $isLeadByte = $isShiftJis
            ? ($byte >= 128 && $byte < 160 || $byte >= 224)
            : $byte >= 128;
        // Step over one or two bytes, depending on the width of the character
        $offset += $isLeadByte ? 2 : 1;
        $count++;
    }
    return $count;
}
2189 
/**
 * Translates a character position into a byte position of an EUC (or Shift-JIS) string.
 *
 * A negative $pos counts characters from the end of the string.
 *
 * @param string $str EUC multibyte character string
 * @param int $pos Character position (negative values start from the end)
 * @param string $charset The charset; 'shift_jis' selects the Shift-JIS lead byte ranges
 * @return int|bool Byte position, or FALSE if the walk leaves the string before a valid position is reached
 */
public function euc_char2byte_pos($str, $pos, $charset)
{
    $sjis = $charset === 'shift_jis';
    // Explicit bounds are used instead of isset($str[$i]): negative string
    // offsets are valid on PHP >= 7.1 and would let the walk wrap around.
    $length = strlen($str);
    // Number of characters seen
    $n = 0;
    // Number of characters wanted
    $p = abs($pos);
    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = $length - 1;
        $d = -1;
    }
    for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
        $c = ord($str[$i]);
        $isDoubleByte = $sjis ? ($c >= 128 && $c < 160 || $c >= 224) : $c >= 128;
        if ($isDoubleByte) {
            // Step over the second byte of the character as well
            $i += $d;
        }
        $n++;
    }
    if ($i < 0 || $i >= $length) {
        // Offset beyond string length
        return false;
    }
    if ($pos < 0) {
        // Correct offset: the loop stopped one byte in front of the character
        $i++;
    }
    return $i;
}
2235 
/**
 * Maps every character of an EUC (or Shift-JIS) string through a conversion table.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset; 'shift_jis' selects the Shift-JIS lead byte ranges
 * @param string $mode Table to use: 'case' (case folding) or 'ascii' (transliteration)
 * @param string $opt For mode 'case': key of the case folding table to apply
 * @return string The mapped string; the unchanged input if the table cannot be initialized or $mode is unknown
 */
public function euc_char_mapping($str, $charset, $mode, $opt = '')
{
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                // No table available: leave the string as it is
                return $str;
            }
            $map = &$this->caseFolding[$charset][$opt];
            break;
        case 'ascii':
            if (!$this->initToASCII($charset)) {
                // No table available: leave the string as it is
                return $str;
            }
            $map = &$this->toASCII[$charset];
            break;
        default:
            return $str;
    }
    $isShiftJis = $charset === 'shift_jis';
    $out = '';
    $offset = 0;
    while (isset($str[$offset])) {
        $char = $str[$offset];
        $byte = ord($char);
        $isDoubleByte = $isShiftJis
            ? ($byte >= 128 && $byte < 160 || $byte >= 224)
            : $byte >= 128;
        if ($isDoubleByte) {
            // A double-byte char: take the following byte along
            $char = substr($str, $offset, 2);
            $offset++;
        }
        // Characters without a table entry are copied unchanged
        $out .= isset($map[$char]) ? $map[$char] : $char;
        $offset++;
    }
    return $out;
}
2291 }
utf8_strpos($haystack, $needle, $offset=0)
convArray(&$array, $fromCharset, $toCharset, $useEntityForNoChar=false)
euc_char_mapping($str, $charset, $mode, $opt='')
static writeFileToTypo3tempDir($filepath, $content)
utf8_to_numberarray($str, $convEntities=false, $retChar=false)
static trimExplode($delim, $string, $removeEmptyValues=false, $limit=0)
crop($charset, $string, $len, $crop='')
entities_to_utf8($str, $alsoStdHtmlEnt=false)
euc_substr($str, $start, $charset, $len=null)
sb_char_mapping($str, $charset, $mode, $opt='')
substr($charset, $string, $start, $len=null)
static getUrl($url, $includeHeader=0, $requestHeaders=false, &$report=null)
static getFileAbsFileName($filename, $onlyRelative=true, $relToTYPO3_mainDir=false)
conv($inputString, $fromCharset, $toCharset, $useEntityForNoChar=false)
utf8_decode($str, $charset, $useEntityForNoChar=false)
if(TYPO3_MODE==='BE') $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_tsfebeuserauth.php']['frontendEditingController']['default']
cropMbstring($charset, $string, $len, $crop='')
$locales
Definition: be_users.php:6