TYPO3 CMS  TYPO3_6-2
CharsetConverter.php
Go to the documentation of this file.
1 <?php
3 
18 
55 
/**
 * Locales instance; assigned in the constructor.
 *
 * @var \TYPO3\CMS\Core\Localization\Locales
 */
protected $locales;

/**
 * ASCII value emitted for characters that have no equivalent (63 is "?").
 *
 * @var int
 */
public $noCharByteVal = 63;

/**
 * Parsed conversion tables, stored (cached) per charset name.
 *
 * @var array
 */
public $parsedCharsets = array();

/**
 * Case folding data, stored (cached) per charset name.
 *
 * @var array
 */
public $caseFolding = array();

/**
 * Charset-to-ASCII mappings, stored (cached) per charset name.
 *
 * @var array
 */
public $toASCII = array();

/**
 * Tells the converter which charsets have two bytes per char.
 *
 * @var array
 */
public $twoByteSets = array(
    'ucs-2' => 1
);

/**
 * Tells the converter which charsets have four bytes per char.
 *
 * @var array
 */
public $fourByteSets = array(
    'ucs-4' => 1, // 4-byte Unicode
    'utf-32' => 1
);

/**
 * Tells the converter which charsets use a scheme like the Extended Unix Code.
 *
 * @var array
 */
public $eucBasedSets = array(
    'gb2312' => 1, // Chinese, simplified.
    'big5' => 1, // Chinese, traditional.
    'euc-kr' => 1, // Korean
    'shift_jis' => 1
);
116 
/**
 * Alias => canonical charset name. Used by parse_charset() to normalize charset names.
 *
 * See http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
 * and http://czyborra.com/charsets/iso8859.html
 *
 * @var array
 */
public $synonyms = array(
    'us' => 'ascii',
    'us-ascii' => 'ascii',
    'cp819' => 'iso-8859-1',
    'ibm819' => 'iso-8859-1',
    'iso-ir-100' => 'iso-8859-1',
    'iso-ir-101' => 'iso-8859-2',
    'iso-ir-109' => 'iso-8859-3',
    'iso-ir-110' => 'iso-8859-4',
    'iso-ir-144' => 'iso-8859-5',
    'iso-ir-127' => 'iso-8859-6',
    'iso-ir-126' => 'iso-8859-7',
    'iso-ir-138' => 'iso-8859-8',
    'iso-ir-148' => 'iso-8859-9',
    'iso-ir-157' => 'iso-8859-10',
    'iso-ir-179' => 'iso-8859-13',
    'iso-ir-199' => 'iso-8859-14',
    'iso-ir-203' => 'iso-8859-15',
    'csisolatin1' => 'iso-8859-1',
    'csisolatin2' => 'iso-8859-2',
    'csisolatin3' => 'iso-8859-3',
    'csisolatin5' => 'iso-8859-9',
    'csisolatin8' => 'iso-8859-14',
    'csisolatin9' => 'iso-8859-15',
    'csisolatingreek' => 'iso-8859-7',
    'iso-celtic' => 'iso-8859-14',
    'latin1' => 'iso-8859-1',
    'latin2' => 'iso-8859-2',
    'latin3' => 'iso-8859-3',
    'latin5' => 'iso-8859-9',
    'latin6' => 'iso-8859-10',
    'latin8' => 'iso-8859-14',
    'latin9' => 'iso-8859-15',
    'l1' => 'iso-8859-1',
    'l2' => 'iso-8859-2',
    'l3' => 'iso-8859-3',
    'l5' => 'iso-8859-9',
    'l6' => 'iso-8859-10',
    'l8' => 'iso-8859-14',
    'l9' => 'iso-8859-15',
    'cyrillic' => 'iso-8859-5',
    'arabic' => 'iso-8859-6',
    'tis-620' => 'iso-8859-11',
    'win874' => 'windows-874',
    'win1250' => 'windows-1250',
    'win1251' => 'windows-1251',
    'win1252' => 'windows-1252',
    'win1253' => 'windows-1253',
    'win1254' => 'windows-1254',
    'win1255' => 'windows-1255',
    'win1256' => 'windows-1256',
    'win1257' => 'windows-1257',
    'win1258' => 'windows-1258',
    'cp1250' => 'windows-1250',
    'cp1251' => 'windows-1251',
    'cp1252' => 'windows-1252',
    'ms-ee' => 'windows-1250',
    'ms-ansi' => 'windows-1252',
    'ms-greek' => 'windows-1253',
    'ms-turk' => 'windows-1254',
    'winbaltrim' => 'windows-1257',
    'koi-8ru' => 'koi-8r',
    'koi8r' => 'koi-8r',
    'cp878' => 'koi-8r',
    'mac' => 'macroman',
    'macintosh' => 'macroman',
    'euc-cn' => 'gb2312',
    'x-euc-cn' => 'gb2312',
    'euccn' => 'gb2312',
    'cp936' => 'gb2312',
    'big-5' => 'big5',
    'cp950' => 'big5',
    'eucjp' => 'euc-jp',
    'sjis' => 'shift_jis',
    'shift-jis' => 'shift_jis',
    'cp932' => 'shift_jis',
    'cp949' => 'euc-kr',
    'utf7' => 'utf-7',
    // 'utf8' used to be listed twice in this table; one entry is enough
    'utf8' => 'utf-8',
    'utf16' => 'utf-16',
    'utf32' => 'utf-32',
    'ucs2' => 'ucs-2',
    'ucs4' => 'ucs-4'
);
207 
/**
 * Mapping of language identifiers to script (language family) names.
 *
 * Keys come in three flavours: iso-639-1 codes, MS language codes and English language names.
 * The resulting script names are the keys of the script-to-charset tables.
 *
 * @var array
 */
public $lang_to_script = array(
    // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
    'af' => 'west_european', // Afrikaans
    'ar' => 'arabic',
    'bg' => 'cyrillic', // Bulgarian
    'bs' => 'east_european', // Bosnian
    'cs' => 'east_european', // Czech
    'da' => 'west_european', // Danish
    'de' => 'west_european', // German
    'es' => 'west_european', // Spanish
    'et' => 'estonian',
    'eo' => 'unicode', // Esperanto
    'eu' => 'west_european', // Basque
    'fa' => 'arabic', // Persian
    'fi' => 'west_european', // Finnish
    'fo' => 'west_european', // Faroese
    'fr' => 'west_european', // French
    'ga' => 'west_european', // Irish
    'gl' => 'west_european', // Galician
    'gr' => 'greek',
    'he' => 'hebrew', // Hebrew (since 1998)
    'hi' => 'unicode', // Hindi
    'hr' => 'east_european', // Croatian
    'hu' => 'east_european', // Hungarian
    'iw' => 'hebrew', // Hebrew (til 1998)
    'is' => 'west_european', // Icelandic
    'it' => 'west_european', // Italian
    'ja' => 'japanese',
    'ka' => 'unicode', // Georgian
    'kl' => 'west_european', // Greenlandic
    'km' => 'unicode', // Khmer
    'ko' => 'korean',
    'lt' => 'lithuanian',
    'lv' => 'west_european', // Latvian/Lettish
    'nl' => 'west_european', // Dutch
    'no' => 'west_european', // Norwegian
    'nb' => 'west_european', // Norwegian Bokmal
    'nn' => 'west_european', // Norwegian Nynorsk
    'pl' => 'east_european', // Polish
    'pt' => 'west_european', // Portuguese
    'ro' => 'east_european', // Romanian
    'ru' => 'cyrillic', // Russian
    'sk' => 'east_european', // Slovak
    'sl' => 'east_european', // Slovenian
    'sr' => 'cyrillic', // Serbian
    'sv' => 'west_european', // Swedish
    'sq' => 'albanian', // Albanian
    'th' => 'thai',
    'uk' => 'cyrillic', // Ukrainian
    'vi' => 'vietnamese',
    'zh' => 'chinese',
    // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
    // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
    'afk' => 'west_european', // Afrikaans
    'ara' => 'arabic',
    'bgr' => 'cyrillic', // Bulgarian
    'cat' => 'west_european', // Catalan
    'chs' => 'simpl_chinese',
    'cht' => 'trad_chinese',
    'csy' => 'east_european', // Czech
    'dan' => 'west_european', // Danish
    'deu' => 'west_european', // German
    'dea' => 'west_european', // German (Austrian)
    'des' => 'west_european', // German (Swiss)
    'ena' => 'west_european', // English (Australian)
    'enc' => 'west_european', // English (Canadian)
    'eng' => 'west_european', // English
    'enz' => 'west_european', // English (New Zealand)
    'enu' => 'west_european', // English (United States)
    'euq' => 'west_european', // Basque
    'fos' => 'west_european', // Faroese
    'far' => 'arabic', // Persian
    'fin' => 'west_european', // Finnish
    'fra' => 'west_european', // French
    'frb' => 'west_european', // French (Belgian)
    'frc' => 'west_european', // French (Canadian)
    'frs' => 'west_european', // French (Swiss)
    'geo' => 'unicode', // Georgian
    'glg' => 'west_european', // Galician
    'ell' => 'greek',
    'heb' => 'hebrew',
    'hin' => 'unicode', // Hindi
    'hun' => 'east_european', // Hungarian
    'isl' => 'west_european', // Icelandic
    'ita' => 'west_european', // Italian
    'its' => 'west_european', // Italian (Swiss)
    'jpn' => 'japanese',
    'khm' => 'unicode', // Khmer
    'kor' => 'korean',
    'lth' => 'lithuanian',
    'lvi' => 'west_european', // Latvian/Lettish
    'msl' => 'west_european', // Malay
    'nlb' => 'west_european', // Dutch (Belgian)
    'nld' => 'west_european', // Dutch
    'nor' => 'west_european', // Norwegian (bokmal)
    'non' => 'west_european', // Norwegian (nynorsk)
    'plk' => 'east_european', // Polish
    'ptg' => 'west_european', // Portuguese
    'ptb' => 'west_european', // Portuguese (Brazil)
    'rom' => 'east_european', // Romanian
    'rus' => 'cyrillic', // Russian
    'slv' => 'east_european', // Slovenian
    'sky' => 'east_european', // Slovak
    'srl' => 'east_european', // Serbian (Latin)
    'srb' => 'cyrillic', // Serbian (Cyrillic)
    'esp' => 'west_european', // Spanish (trad. sort)
    'esm' => 'west_european', // Spanish (Mexican)
    'esn' => 'west_european', // Spanish (internat. sort)
    'sve' => 'west_european', // Swedish
    'sqi' => 'albanian', // Albanian
    'tha' => 'thai',
    'trk' => 'turkish',
    'ukr' => 'cyrillic', // Ukrainian
    // English language names
    'afrikaans' => 'west_european',
    'albanian' => 'albanian',
    'arabic' => 'arabic',
    'basque' => 'west_european',
    'bosnian' => 'east_european',
    // Was 'east_european', which contradicted the 'bg' and 'bgr' entries above
    'bulgarian' => 'cyrillic',
    'catalan' => 'west_european',
    'croatian' => 'east_european',
    'czech' => 'east_european',
    'danish' => 'west_european',
    'dutch' => 'west_european',
    'english' => 'west_european',
    'esperanto' => 'unicode',
    'estonian' => 'estonian',
    'faroese' => 'west_european',
    'farsi' => 'arabic',
    'finnish' => 'west_european',
    'french' => 'west_european',
    'galician' => 'west_european',
    'georgian' => 'unicode',
    'german' => 'west_european',
    'greek' => 'greek',
    'greenlandic' => 'west_european',
    'hebrew' => 'hebrew',
    'hindi' => 'unicode',
    'hungarian' => 'east_european',
    'icelandic' => 'west_european',
    'italian' => 'west_european',
    'khmer' => 'unicode',
    'latvian' => 'west_european',
    'lettish' => 'west_european',
    'lithuanian' => 'lithuanian',
    'malay' => 'west_european',
    'norwegian' => 'west_european',
    'persian' => 'arabic',
    'polish' => 'east_european',
    'portuguese' => 'west_european',
    'russian' => 'cyrillic',
    'romanian' => 'east_european',
    'serbian' => 'cyrillic',
    'slovak' => 'east_european',
    'slovenian' => 'east_european',
    'spanish' => 'west_european',
    // 'svedish' and 'that' are historic misspellings, kept so existing lookups keep working
    'svedish' => 'west_european',
    'swedish' => 'west_european',
    'that' => 'thai',
    'thai' => 'thai',
    'turkish' => 'turkish',
    'ukrainian' => 'cyrillic'
);
464 
/**
 * Mapping of script (language family) names to charsets on Unix.
 *
 * An empty value means that no charset is assigned here; get_locale_charset()
 * then applies its own default.
 *
 * @var array
 */
public $script_to_charset_unix = array(
    'west_european' => 'iso-8859-1',
    'estonian' => 'iso-8859-1',
    'east_european' => 'iso-8859-2',
    'baltic' => 'iso-8859-4',
    'cyrillic' => 'iso-8859-5',
    'arabic' => 'iso-8859-6',
    'greek' => 'iso-8859-7',
    'hebrew' => 'iso-8859-8',
    'turkish' => 'iso-8859-9',
    'thai' => 'iso-8859-11', // = TIS-620
    'lithuanian' => 'iso-8859-13',
    'chinese' => 'gb2312', // = euc-cn
    'japanese' => 'euc-jp',
    'korean' => 'euc-kr',
    'simpl_chinese' => 'gb2312',
    'trad_chinese' => 'big5',
    'vietnamese' => '',
    'unicode' => 'utf-8',
    'albanian' => 'utf-8'
);
492 
/**
 * Mapping of script (language family) names to charsets on Windows.
 *
 * Read by get_locale_charset() when TYPO3_OS is 'WIN'.
 *
 * @var array
 */
public $script_to_charset_windows = array(
    'east_european' => 'windows-1250',
    'cyrillic' => 'windows-1251',
    'west_european' => 'windows-1252',
    'greek' => 'windows-1253',
    'turkish' => 'windows-1254',
    'hebrew' => 'windows-1255',
    'arabic' => 'windows-1256',
    'baltic' => 'windows-1257',
    'estonian' => 'windows-1257',
    'lithuanian' => 'windows-1257',
    'vietnamese' => 'windows-1258',
    'thai' => 'cp874',
    'korean' => 'cp949',
    'chinese' => 'gb2312',
    'japanese' => 'shift_jis',
    'simpl_chinese' => 'gb2312',
    'trad_chinese' => 'big5',
    'albanian' => 'windows-1250',
    'unicode' => 'utf-8'
);
518 
/**
 * Mapping of complete locale names to charsets.
 *
 * get_locale_charset() consults this table before it analyses the locale string.
 *
 * @var array
 */
public $locale_to_charset = array(
    'japanese.euc' => 'euc-jp',
    'ja_jp.ujis' => 'euc-jp',
    'korean.euc' => 'euc-kr',
    'sr@Latn' => 'iso-8859-2',
    'zh_cn' => 'gb2312',
    'zh_hk' => 'big5',
    'zh_tw' => 'big5'
);
532 
/**
 * TYPO3 specific: the system charset used for each system language in TYPO3.
 *
 * An empty value means "iso-8859-1".
 *
 * @var array
 */
public $charSetArray = array(
    'af' => '',
    'ar' => 'iso-8859-6',
    'ba' => 'iso-8859-2',
    'bg' => 'windows-1251',
    'br' => '',
    'ca' => 'iso-8859-15',
    'ch' => 'gb2312',
    'cs' => 'windows-1250',
    'cz' => 'windows-1250',
    'da' => '',
    'de' => '',
    'dk' => '',
    'el' => 'iso-8859-7',
    'eo' => 'utf-8',
    'es' => '',
    'et' => 'iso-8859-4',
    'eu' => '',
    'fa' => 'utf-8',
    'fi' => '',
    'fo' => 'utf-8',
    'fr' => '',
    'fr_CA' => '',
    'ga' => '',
    'ge' => 'utf-8',
    'gl' => '',
    'gr' => 'iso-8859-7',
    'he' => 'utf-8',
    'hi' => 'utf-8',
    'hk' => 'big5',
    'hr' => 'windows-1250',
    'hu' => 'iso-8859-2',
    'is' => 'utf-8',
    'it' => '',
    'ja' => 'shift_jis',
    'jp' => 'shift_jis',
    'ka' => 'utf-8',
    'kl' => 'utf-8',
    'km' => 'utf-8',
    'ko' => 'euc-kr',
    'kr' => 'euc-kr',
    'lt' => 'windows-1257',
    'lv' => 'utf-8',
    'ms' => '',
    'my' => '',
    'nl' => '',
    'no' => '',
    'pl' => 'iso-8859-2',
    'pt' => '',
    'pt_BR' => '',
    'qc' => '',
    'ro' => 'iso-8859-2',
    'ru' => 'windows-1251',
    'se' => '',
    'si' => 'windows-1250',
    'sk' => 'windows-1250',
    'sl' => 'windows-1250',
    'sq' => 'utf-8',
    'sr' => 'utf-8',
    'sv' => '',
    'th' => 'iso-8859-11',
    'tr' => 'iso-8859-9',
    'ua' => 'windows-1251',
    'uk' => 'windows-1251',
    'vi' => 'utf-8',
    'vn' => 'utf-8',
    'zh' => 'big5'
);
606 
/**
 * Default constructor: obtains the Locales object through GeneralUtility::makeInstance()
 * and keeps it in $this->locales.
 */
public function __construct() {
    $localesClassName = 'TYPO3\\CMS\\Core\\Localization\\Locales';
    $this->locales = GeneralUtility::makeInstance($localesClassName);
}
613 
/**
 * Normalizes a charset name: lowercases and trims it, then resolves it through
 * the $synonyms table when an alias is registered there.
 *
 * @param string $charset Input charset name
 * @return string Normalized charset name (the input in lowercase when no synonym is known)
 */
public function parse_charset($charset) {
    $normalized = trim(strtolower($charset));
    return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
628 
/**
 * Determines the charset belonging to a locale string such as "de_DE.ISO-8859-1@euro".
 *
 * Resolution order:
 *  1. exact entry in $locale_to_charset (compared case-insensitively),
 *  2. charset embedded in the locale (part after "."), normalized with parse_charset(),
 *  3. "@euro" modifier => iso-8859-15,
 *  4. language part (before "_") => script => charset table of the current OS,
 *     with 'windows-1252' (Windows) or 'utf-8' (other systems) as last resort.
 *
 * @param string $locale Locale string
 * @return string Charset resolved from the locale
 */
public function get_locale_charset($locale) {
    $locale = strtolower($locale);
    // Exact locale specific charset?
    if (isset($this->locale_to_charset[$locale])) {
        return $this->locale_to_charset[$locale];
    }
    // The table may contain mixed-case keys (e.g. 'sr@Latn') which the lowercased
    // locale can never hit directly, so compare those case-insensitively as well.
    foreach ($this->locale_to_charset as $knownLocale => $knownCharset) {
        if (strtolower($knownLocale) === $locale) {
            return $knownCharset;
        }
    }
    // Split off modifier and charset; array_pad() keeps list() from reading missing offsets
    list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');
    list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
    // Locale contains charset: use it
    if ($charset) {
        return $this->parse_charset($charset);
    }
    // Modifier is 'euro' (after charset check, because of xx.utf-8@euro)
    if ($modifier === 'euro') {
        return 'iso-8859-15';
    }
    // Get language (the country part after "_" is not needed)
    $localeParts = explode('_', $locale);
    $language = $localeParts[0];
    // Unknown languages get an empty script name and thereby the OS default below
    $script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';
    if (TYPO3_OS == 'WIN') {
        $scriptToCharset = $this->script_to_charset_windows;
        $fallbackCharset = 'windows-1252';
    } else {
        $scriptToCharset = $this->script_to_charset_unix;
        $fallbackCharset = 'utf-8';
    }
    // Empty table values (e.g. 'vietnamese' on Unix) fall back to the default, too
    return !empty($scriptToCharset[$script]) ? $scriptToCharset[$script] : $fallbackCharset;
}
670 
671  /********************************************
672  *
673  * Charset Conversion functions
674  *
675  ********************************************/
/**
 * Converts $str from one charset to another.
 *
 * When the configured method ($TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod']) is
 * 'mbstring', 'iconv' or 'recode', that PHP library is tried first - but only if
 * the target is utf-8 or no entity fallback is requested. If the library is not
 * used or reports failure (FALSE), the conversion goes through UTF-8 using
 * utf8_encode() and utf8_decode().
 *
 * @param string $str Input string
 * @param string $fromCS From charset (the calls below compare against lowercase 'utf-8')
 * @param string $toCS To charset
 * @param bool $useEntityForNoChar If set, unknown chars become numeric entities instead of the "no char" byte
 * @return string Converted string
 */
public function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
    if ($fromCS == $toCS) {
        return $str;
    }
    // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
    if ($toCS == 'utf-8' || !$useEntityForNoChar) {
        $method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
        // FALSE means: no library used, or the library does not support the charsets
        $libraryResult = FALSE;
        if ($method == 'mbstring') {
            $libraryResult = mb_convert_encoding($str, $toCS, $fromCS);
        } elseif ($method == 'iconv') {
            $libraryResult = iconv($fromCS, $toCS . '//TRANSLIT', $str);
        } elseif ($method == 'recode') {
            $libraryResult = recode_string($fromCS . '..' . $toCS, $str);
        }
        if (FALSE !== $libraryResult) {
            return $libraryResult;
        }
    }
    // Table based conversion with UTF-8 as the intermediate format
    if ($fromCS != 'utf-8') {
        $str = $this->utf8_encode($str, $fromCS);
    }
    if ($toCS != 'utf-8') {
        $str = $this->utf8_decode($str, $toCS, $useEntityForNoChar);
    }
    return $str;
}
723 
/**
 * Converts all string values of an array (recursively, in place) from one charset to another.
 * Values that are neither arrays nor strings are left untouched.
 *
 * @param array $array Array to convert, passed by reference
 * @param string $fromCS From charset
 * @param string $toCS To charset
 * @param bool $useEntityForNoChar Passed on to conv()
 * @return void
 */
public function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
    foreach (array_keys($array) as $index) {
        if (is_array($array[$index])) {
            // Descend into nested arrays
            $this->convArray($array[$index], $fromCS, $toCS, $useEntityForNoChar);
            continue;
        }
        if (is_string($array[$index])) {
            $array[$index] = $this->conv($array[$index], $fromCS, $toCS, $useEntityForNoChar);
        }
    }
}
745 
/**
 * Converts $str from $charset to UTF-8 using the parsed conversion table of that charset.
 *
 * Bytes up to 127 are copied unchanged (except for two-byte charsets, where every
 * pair of bytes forms one big-endian character number). Characters without an
 * entry in the conversion table are replaced by chr($this->noCharByteVal).
 *
 * @param string $str String in local charset to convert to UTF-8
 * @param string $charset Charset name as used for the conversion table lookup
 * @return string|NULL UTF-8 string, or NULL if no conversion table could be initialized
 */
public function utf8_encode($str, $charset) {
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already
    if (!$this->initCharset($charset)) {
        // No conversion table: nothing to return (callers receive NULL, as before)
        return NULL;
    }
    // Loop invariants
    $isTwoByteSet = isset($this->twoByteSets[$charset]);
    $isEucBasedSet = isset($this->eucBasedSets[$charset]);
    $noChar = chr($this->noCharByteVal);
    $strLen = strlen($str);
    $outStr = '';
    // Traverse each char in string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = substr($str, $a, 1);
        $ord = ord($chr);
        if ($isTwoByteSet) {
            // Two bytes per char, big endian assumed. substr() is used instead of $str[$a]
            // so that a dangling last byte does not trigger an "Uninitialized string offset"
            // error; the missing byte counts as 0.
            $a++;
            $ord = $ord << 8 | ord(substr($str, $a, 1));
        } elseif ($ord > 127) {
            // EUC uses two bytes above 127: fetch both, advance the pointer and make $ord a 16 bit int.
            // Shift-JIS: chars between 160 and 223 are single byte.
            if ($isEucBasedSet && ($charset != 'shift_jis' || ($ord < 160 || $ord > 223))) {
                $a++;
                $ord = $ord * 256 + ord(substr($str, $a, 1));
            }
        } else {
            // Plain 7 bit char
            $outStr .= $chr;
            continue;
        }
        // Use the UTF-8 sequence from the conversion table, otherwise the "no char" byte
        if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
            $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
        } else {
            $outStr .= $noChar;
        }
    }
    return $outStr;
}
804 
/**
 * Converts a UTF-8 string to $charset using the parsed conversion table of that charset.
 *
 * Bytes up to 127 are copied unchanged. Multibyte sequences are looked up in the
 * table; unknown sequences become a numeric entity (if $useEntityForNoChar is set)
 * or chr($this->noCharByteVal). Stray continuation bytes also yield the "no char" byte.
 *
 * @param string $str UTF-8 string to convert
 * @param string $charset Charset name as used for the conversion table lookup
 * @param bool $useEntityForNoChar If set, unknown chars are written as "&#x...;" entities
 * @return string|NULL Converted string, or NULL if no conversion table could be initialized
 */
public function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already
    if (!$this->initCharset($charset)) {
        return NULL;
    }
    $length = strlen($str);
    $unknownChar = chr($this->noCharByteVal);
    $result = '';
    // Traverse each byte of the UTF-8 string
    for ($pos = 0; $pos < $length; $pos++) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        // Single byte char: copy
        if ($code <= 127) {
            $result .= $byte;
            continue;
        }
        // A first byte must have the 7th bit set, otherwise we are in the middle of a sequence
        if (!($code & 64)) {
            $result .= $unknownChar;
            continue;
        }
        // Collect the sequence: every further leading 1-bit of the first byte announces one more byte
        $sequence = $byte;
        for ($bit = 0; $bit < 8; $bit++) {
            $code = $code << 1;
            if (!($code & 128)) {
                break;
            }
            $pos++;
            $sequence .= substr($str, $pos, 1);
        }
        if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
            // The local number; values above 255 are split into two bytes (16 bit word assumed)
            $localNumber = $this->parsedCharsets[$charset]['utf8'][$sequence];
            if ($localNumber > 255) {
                $result .= chr(($localNumber >> 8 & 255)) . chr(($localNumber & 255));
            } else {
                $result .= chr($localNumber);
            }
        } elseif ($useEntityForNoChar) {
            // Create num entity
            $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
        } else {
            $result .= $unknownChar;
        }
    }
    return $result;
}
873 
/**
 * Converts all multibyte chars of a UTF-8 string into hexadecimal numeric entities ("&#x...;").
 * Bytes up to 127 are copied unchanged; stray continuation bytes become chr($this->noCharByteVal).
 *
 * @param string $str Input string, UTF-8
 * @return string Output string
 */
public function utf8_to_entities($str) {
    $length = strlen($str);
    $result = '';
    // Traverse each byte of the UTF-8 string
    for ($pos = 0; $pos < $length; $pos++) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        // Single byte char: copy
        if ($code <= 127) {
            $result .= $byte;
            continue;
        }
        // A first byte must have the 7th bit set, otherwise we are in the middle of a sequence
        if (!($code & 64)) {
            $result .= chr($this->noCharByteVal);
            continue;
        }
        // Collect the sequence: every further leading 1-bit of the first byte announces one more byte
        $sequence = $byte;
        for ($bit = 0; $bit < 8; $bit++) {
            $code = $code << 1;
            if (!($code & 128)) {
                break;
            }
            $pos++;
            $sequence .= substr($str, $pos, 1);
        }
        $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
    }
    return $result;
}
918 
/**
 * Converts numeric entities ("&#1234;", "&#x1b;") in a string to UTF-8 chars.
 *
 * Only well-formed numeric entities are converted. Anything else matching "&...;"
 * (for instance "&#;" or "&#foo;") is left in the string untouched instead of being
 * turned into a NUL or garbage byte.
 *
 * @param string $str Input string, UTF-8
 * @param bool $alsoStdHtmlEnt If set, named entities from the HTML_ENTITIES translation table (like &amp; or &pound;) are converted as well
 * @return string Output string
 */
public function entities_to_utf8($str, $alsoStdHtmlEnt = FALSE) {
    $trans_tbl = array();
    if ($alsoStdHtmlEnt) {
        $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'));
    }
    // Split into alternating plain text (even index) and entity names (odd index).
    // Capturing the delimiter replaces the former md5(microtime()) marker token.
    $parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
    if ($parts === FALSE) {
        // PCRE failure: return the input unchanged
        return $str;
    }
    foreach ($parts as $k => $v) {
        // Only take every second element
        if ($k % 2 === 0) {
            continue;
        }
        $match = array();
        if (preg_match('/^#[xX]([0-9a-fA-F]+)$/', $v, $match)) {
            // Hex entity; the marker may be written as "x" or "X"
            $parts[$k] = $this->UnumberToChar(hexdec($match[1]));
        } elseif (preg_match('/^#([0-9]+)$/', $v, $match)) {
            // Dec entity
            $parts[$k] = $this->UnumberToChar($match[1]);
        } elseif ($alsoStdHtmlEnt && isset($trans_tbl['&' . $v . ';'])) {
            // Other entities
            $parts[$k] = $trans_tbl['&' . $v . ';'];
        } else {
            // No conversion
            $parts[$k] = '&' . $v . ';';
        }
    }
    return implode('', $parts);
}
959 
/**
 * Splits a UTF-8 string into an array with one element per character.
 *
 * By default each element is the Unicode number of the character; with $retChar
 * set, each element is the character itself. Stray continuation bytes are reported
 * as $this->noCharByteVal (or the matching char).
 *
 * @param string $str Input string, UTF-8
 * @param bool $convEntities If set, entities (numeric and named) are converted to UTF-8 first
 * @param bool $retChar If set, the chars themselves are returned instead of their numbers
 * @return array Unicode numbers or UTF-8 chars, one per character
 */
public function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
    // If entities must be registered as well
    if ($convEntities) {
        $str = $this->entities_to_utf8($str, 1);
    }
    $length = strlen($str);
    $result = array();
    // Traverse each byte of the UTF-8 string
    for ($pos = 0; $pos < $length; $pos++) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        // Single byte char
        if ($code <= 127) {
            $result[] = $retChar ? chr($code) : $code;
            continue;
        }
        // A first byte must have the 7th bit set, otherwise we are in the middle of a sequence
        if (!($code & 64)) {
            $result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
            continue;
        }
        // Collect the sequence: every further leading 1-bit of the first byte announces one more byte
        $sequence = $byte;
        for ($bit = 0; $bit < 8; $bit++) {
            $code = $code << 1;
            if (!($code & 128)) {
                break;
            }
            $pos++;
            $sequence .= substr($str, $pos, 1);
        }
        $result[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
    }
    return $result;
}
1011 
/**
 * Converts a Unicode number to its UTF-8 byte sequence.
 *
 * Numbers below 128 yield a single byte. Larger numbers yield a lead byte followed
 * by 1 to 5 continuation bytes carrying 6 bits each. Numbers from 2147483648 upwards
 * cannot be expressed and yield chr($this->noCharByteVal).
 *
 * @param int $cbyte Unicode number
 * @return string UTF-8 byte sequence
 */
public function UnumberToChar($cbyte) {
    if ($cbyte < 128) {
        return chr($cbyte);
    }
    // Exclusive upper limit => array(marker bits of the lead byte, number of continuation bytes)
    $layouts = array(
        2048 => array(192, 1),
        65536 => array(224, 2),
        2097152 => array(240, 3),
        67108864 => array(248, 4),
        2147483648 => array(252, 5)
    );
    foreach ($layouts as $limit => $layout) {
        if ($cbyte < $limit) {
            list($leadMarker, $continuationBytes) = $layout;
            // Lead byte: marker plus the bits above all continuation bytes
            $sequence = chr($leadMarker | ($cbyte >> (6 * $continuationBytes)));
            // Continuation bytes: 10xxxxxx, most significant group first
            for ($group = $continuationBytes - 1; $group >= 0; $group--) {
                $sequence .= chr(128 | (($cbyte >> (6 * $group)) & 63));
            }
            return $sequence;
        }
    }
    // Cannot express a 32-bit character in UTF-8
    return chr($this->noCharByteVal);
}
1078 
/**
 * Converts a single UTF-8 char (byte sequence) to its Unicode number.
 *
 * If the first byte does not start with the bits "11", its byte value is returned as is.
 *
 * @param string $str UTF-8 byte sequence of one character
 * @param bool $hex If set, the number is returned as "x" followed by its hexadecimal digits
 * @return int|string Unicode number, or the "x..." hex string if $hex is set
 */
public function utf8CharToUnumber($str, $hex = 0) {
    // First char
    $leadByte = ord($str[0]);
    if (($leadByte & 192) != 192) {
        // Not the start of a multibyte sequence
        $number = $leadByte;
    } else {
        $shifted = $leadByte;
        $trailingBits = '';
        // Every further leading 1-bit of the first byte announces one more byte in the sequence
        for ($count = 0; $count < 8; $count++) {
            $shifted = $shifted << 1;
            if (!($shifted & 128)) {
                break;
            }
            // Take the lower 6 bits of the following byte
            $trailingBits .= substr('00000000' . decbin(ord(substr($str, ($count + 1), 1))), -6);
        }
        // The payload of the first byte shrinks with each byte that follows
        $leadBits = substr(('00000000' . decbin($leadByte)), -(6 - $count));
        $number = bindec($leadBits . $trailingBits);
    }
    return $hex ? 'x' . dechex($number) : $number;
}
1113 
1114  /********************************************
1115  *
1116  * Init functions
1117  *
1118  ********************************************/
/**
 * Makes sure the conversion table of $charset is available in $this->parsedCharsets.
 *
 * The table is read from the cache file typo3temp/cs/charset_[charset].tbl if that
 * holds a usable table; otherwise it is parsed from
 * Resources/Private/Charsets/csconvtbl/[charset].tbl of the core extension and the
 * cache file is (re)written. Only local char numbers above 127 are stored.
 *
 * @param string $charset Charset name as used in the conversion table file name
 * @return int|bool 1 if the table was loaded already, 2 if it was loaded by this call, FALSE if no conversion table file exists
 */
public function initCharset($charset) {
    // Only process if the charset is not yet loaded (isset() avoids an undefined index notice)
    if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
        return 1;
    }
    // Conversion table filename
    $charsetConvTableFile = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/csconvtbl/' . $charset . '.tbl';
    if (!($charset && GeneralUtility::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) {
        return FALSE;
    }
    // Cache file for charsets:
    // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(GeneralUtility::getUrl($cacheFile));
        // A truncated or corrupt cache file does not unserialize into a table; parse again in that case
        if (is_array($cachedTable) && isset($cachedTable['local']) && isset($cachedTable['utf8'])) {
            $this->parsedCharsets[$charset] = $cachedTable;
            return 2;
        }
    }
    // Parse conversion table into lines
    $lines = GeneralUtility::trimExplode(LF, GeneralUtility::getUrl($charsetConvTableFile), TRUE);
    $table = array('local' => array(), 'utf8' => array());
    // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
    $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
    $detectedType = '';
    foreach ($lines as $value) {
        // Comment line or blanks are ignored
        if (!trim($value) || $value[0] === '#') {
            continue;
        }
        // Detect type if not done yet (done on first real line)
        if (!$detectedType) {
            $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
        }
        if ($detectedType == 'ms-token') {
            $tokens = preg_split('/[=:]/', $value, 3);
            if (count($tokens) < 2) {
                // Line without "=" or ":" carries no mapping
                continue;
            }
            list($hexbyte, $utf8) = $tokens;
        } else {
            $regA = array();
            if (!preg_match($whitespacedPattern, $value, $regA)) {
                // Line does not follow the detected syntax
                continue;
            }
            $hexbyte = $regA[1];
            $utf8 = 'U+' . $regA[2];
        }
        $decval = hexdec(trim($hexbyte));
        if ($decval > 127) {
            // Strip the "U+" prefix, store both lookup directions
            $utf8decval = hexdec(substr(trim($utf8), 2));
            $utf8Char = $this->UnumberToChar($utf8decval);
            $table['local'][$decval] = $utf8Char;
            $table['utf8'][$utf8Char] = $decval;
        }
    }
    $this->parsedCharsets[$charset] = $table;
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($table));
    }
    return 2;
}
1185 
1196  public function initUnicodeData($mode = NULL) {
1197  // Cache files
1198  $cacheFileCase = GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
1199  $cacheFileASCII = GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
1200  // Only process if the tables are not yet loaded
1201  switch ($mode) {
1202  case 'case':
1203  if (is_array($this->caseFolding['utf-8'])) {
1204  return 1;
1205  }
1206  // Use cached version if possible
1207  if ($cacheFileCase && @is_file($cacheFileCase)) {
1208  $this->caseFolding['utf-8'] = unserialize(GeneralUtility::getUrl($cacheFileCase));
1209  return 2;
1210  }
1211  break;
1212  case 'ascii':
1213  if (is_array($this->toASCII['utf-8'])) {
1214  return 1;
1215  }
1216  // Use cached version if possible
1217  if ($cacheFileASCII && @is_file($cacheFileASCII)) {
1218  $this->toASCII['utf-8'] = unserialize(GeneralUtility::getUrl($cacheFileASCII));
1219  return 2;
1220  }
1221  break;
1222  }
1223  // Process main Unicode data file
1224  $unicodeDataFile = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/UnicodeData.txt';
1225  if (!(GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
1226  return FALSE;
1227  }
1228  $fh = fopen($unicodeDataFile, 'rb');
1229  if (!$fh) {
1230  return FALSE;
1231  }
1232  // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
1233  // Note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
1234  $this->caseFolding['utf-8'] = array();
1235  $utf8CaseFolding = &$this->caseFolding['utf-8'];
1236  // a shorthand
1237  $utf8CaseFolding['toUpper'] = array();
1238  $utf8CaseFolding['toLower'] = array();
1239  $utf8CaseFolding['toTitle'] = array();
1240  // Array of temp. decompositions
1241  $decomposition = array();
1242  // Array of chars that are marks (eg. composing accents)
1243  $mark = array();
1244  // Array of chars that are numbers (eg. digits)
1245  $number = array();
1246  // Array of chars to be omitted (eg. Russian hard sign)
1247  $omit = array();
1248  while (!feof($fh)) {
1249  $line = fgets($fh, 4096);
1250  // Has a lot of info
1251  list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title, ) = explode(';', rtrim($line));
1252  $ord = hexdec($char);
1253  if ($ord > 65535) {
1254  // Only process the BMP
1255  break;
1256  }
1257  $utf8_char = $this->UnumberToChar($ord);
1258  if ($upper) {
1259  $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
1260  }
1261  if ($lower) {
1262  $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
1263  }
1264  // Store "title" only when different from "upper" (only a few)
1265  if ($title && $title != $upper) {
1266  $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
1267  }
1268  switch ($cat[0]) {
1269  case 'M':
1270  // mark (accent, umlaut, ...)
1271  $mark['U+' . $char] = 1;
1272  break;
1273  case 'N':
1274  // numeric value
1275  if ($ord > 128 && $num != '') {
1276  $number['U+' . $char] = $num;
1277  }
1278  }
1279  // Accented Latin letters without "official" decomposition
1280  $match = array();
1281  if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
1282  $c = ord($match[2]);
1283  if ($match[1] == 'SMALL') {
1284  $c += 32;
1285  }
1286  $decomposition['U+' . $char] = array(dechex($c));
1287  continue;
1288  }
1289  $match = array();
1290  if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
1291  switch ($match[1]) {
1292  case '<circle>':
1293  // add parenthesis as circle replacement, eg (1)
1294  $match[2] = '0028 ' . $match[2] . ' 0029';
1295  break;
1296  case '<square>':
1297  // add square brackets as square replacement, eg [1]
1298  $match[2] = '005B ' . $match[2] . ' 005D';
1299  break;
1300  case '<compat>':
1301  // ignore multi char decompositions that start with a space
1302  if (preg_match('/^0020 /', $match[2])) {
1303  continue 2;
1304  }
1305  break;
1306  case '<initial>':
1307 
1308  case '<medial>':
1309 
1310  case '<final>':
1311 
1312  case '<isolated>':
1313 
1314  case '<vertical>':
1315  continue 2;
1316  }
1317  $decomposition['U+' . $char] = explode(' ', $match[2]);
1318  }
1319  }
1320  fclose($fh);
1321  // Process additional Unicode data for casing (allow folded characters to expand into a sequence)
1322  $specialCasingFile = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/SpecialCasing.txt';
1323  if (GeneralUtility::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
1324  $fh = fopen($specialCasingFile, 'rb');
1325  if ($fh) {
1326  while (!feof($fh)) {
1327  $line = fgets($fh, 4096);
1328  if ($line[0] != '#' && trim($line) != '') {
1329  list($char, $lower, $title, $upper, $cond) = GeneralUtility::trimExplode(';', $line);
1330  if ($cond == '' || $cond[0] == '#') {
1331  $utf8_char = $this->UnumberToChar(hexdec($char));
1332  if ($char != $lower) {
1333  $arr = explode(' ', $lower);
1334  for ($i = 0; isset($arr[$i]); $i++) {
1335  $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1336  }
1337  $utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
1338  }
1339  if ($char != $title && $title != $upper) {
1340  $arr = explode(' ', $title);
1341  for ($i = 0; isset($arr[$i]); $i++) {
1342  $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1343  }
1344  $utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
1345  }
1346  if ($char != $upper) {
1347  $arr = explode(' ', $upper);
1348  for ($i = 0; isset($arr[$i]); $i++) {
1349  $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1350  }
1351  $utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
1352  }
1353  }
1354  }
1355  }
1356  fclose($fh);
1357  }
1358  }
1359  // Process custom decompositions
1360  $customTranslitFile = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/Translit.txt';
1361  if (GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
1362  $fh = fopen($customTranslitFile, 'rb');
1363  if ($fh) {
1364  while (!feof($fh)) {
1365  $line = fgets($fh, 4096);
1366  if ($line[0] != '#' && trim($line) != '') {
1367  list($char, $translit) = GeneralUtility::trimExplode(';', $line);
1368  if (!$translit) {
1369  $omit['U+' . $char] = 1;
1370  }
1371  $decomposition['U+' . $char] = explode(' ', $translit);
1372  }
1373  }
1374  fclose($fh);
1375  }
1376  }
1377  // Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
1378  foreach ($decomposition as $from => $to) {
1379  $code_decomp = array();
1380  while ($code_value = array_shift($to)) {
1381  // Do recursive decomposition
1382  if (isset($decomposition['U+' . $code_value])) {
1383  foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
1384  array_unshift($to, $cv);
1385  }
1386  } elseif (!isset($mark[('U+' . $code_value)])) {
1387  // remove mark
1388  array_push($code_decomp, $code_value);
1389  }
1390  }
1391  if (count($code_decomp) || isset($omit[$from])) {
1392  $decomposition[$from] = $code_decomp;
1393  } else {
1394  unset($decomposition[$from]);
1395  }
1396  }
1397  // Create ascii only mapping
1398  $this->toASCII['utf-8'] = array();
1399  $ascii = &$this->toASCII['utf-8'];
1400  foreach ($decomposition as $from => $to) {
1401  $code_decomp = array();
1402  while ($code_value = array_shift($to)) {
1403  $ord = hexdec($code_value);
1404  if ($ord > 127) {
1405  continue 2;
1406  } else {
1407  // Skip decompositions containing non-ASCII chars
1408  array_push($code_decomp, chr($ord));
1409  }
1410  }
1411  $ascii[$this->UnumberToChar(hexdec($from))] = join('', $code_decomp);
1412  }
1413  // Add numeric decompositions
1414  foreach ($number as $from => $to) {
1415  $utf8_char = $this->UnumberToChar(hexdec($from));
1416  if (!isset($ascii[$utf8_char])) {
1417  $ascii[$utf8_char] = $to;
1418  }
1419  }
1420  if ($cacheFileCase) {
1421  GeneralUtility::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
1422  }
1423  if ($cacheFileASCII) {
1424  GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
1425  }
1426  return 3;
1427  }
1428 
/**
 * Makes sure the case folding table for a charset is available in $this->caseFolding.
 *
 * The table is derived from the UTF-8 case folding data: every character of the
 * charset's parsed conversion table is folded in UTF-8 and converted back to $charset.
 * The plain ASCII letters are added afterwards.
 *
 * @param string $charset Charset name; used as array key and as part of the cache file name
 * @return int|bool 1 if the table was already loaded, 2 if it was read from the cache file, 3 if it was generated; FALSE if the charset or the Unicode data could not be initialized
 */
public function initCaseFolding($charset) {
	// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
		return 1;
	}
	// Use cached version if possible
	$cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cachedTable = unserialize(GeneralUtility::getUrl($cacheFile));
		// A cache file that cannot be read or unserialized is ignored and the table is rebuilt
		if (is_array($cachedTable)) {
			$this->caseFolding[$charset] = $cachedTable;
			return 2;
		}
	}
	// Init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return FALSE;
	}
	// UTF-8 case folding is used as the base conversion table
	if (!$this->initUnicodeData('case')) {
		return FALSE;
	}
	$nochar = chr($this->noCharByteVal);
	$utf8Folding = $this->caseFolding['utf-8'];
	$directions = array('toUpper', 'toLower', 'toTitle');
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);
		foreach ($directions as $direction) {
			// Most characters have no folding in a given direction
			if (!isset($utf8Folding[$direction][$utf8])) {
				continue;
			}
			$cc = $this->utf8_decode($utf8Folding[$direction][$utf8], $charset);
			// Skip foldings that cannot be represented in the target charset
			if ($cc != '' && $cc != $nochar) {
				$this->caseFolding[$charset][$direction][$c] = $cc;
			}
		}
	}
	// Add the ASCII case table
	for ($i = ord('a'); $i <= ord('z'); $i++) {
		$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
	}
	for ($i = ord('A'); $i <= ord('Z'); $i++) {
		$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
	}
	if ($cacheFile) {
		GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
	}
	return 3;
}
1490 
/**
 * Makes sure the charset-to-ASCII transliteration table for a charset is available
 * in $this->toASCII.
 *
 * The table is derived from the UTF-8 transliteration table: every character of the
 * charset's parsed conversion table that has a UTF-8 transliteration is converted
 * back to $charset and used as key.
 *
 * @param string $charset Charset name; used as array key and as part of the cache file name
 * @return int|bool 1 if the table was already loaded, 2 if it was read from the cache file, 3 if it was generated; FALSE if the charset or the Unicode data could not be initialized
 */
public function initToASCII($charset) {
	// Only process if the transliteration table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
		return 1;
	}
	// Use cached version if possible
	$cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cachedTable = unserialize(GeneralUtility::getUrl($cacheFile));
		// A cache file that cannot be read or unserialized is ignored and the table is rebuilt
		if (is_array($cachedTable)) {
			$this->toASCII[$charset] = $cachedTable;
			return 2;
		}
	}
	// Init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return FALSE;
	}
	// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return FALSE;
	}
	// Start from an array so that a charset without any mapping is still cached as
	// an (empty) table and recognized as "loaded" on the next call
	if (!isset($this->toASCII[$charset]) || !is_array($this->toASCII[$charset])) {
		$this->toASCII[$charset] = array();
	}
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		if (isset($this->toASCII['utf-8'][$utf8])) {
			// Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
			$c = $this->utf8_decode($utf8, $charset);
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}
	if ($cacheFile) {
		GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
	}
	return 3;
}
1532 
1533  /********************************************
1534  *
1535  * String operation functions
1536  *
1537  ********************************************/
/**
 * Returns a part of a string, counted in characters of the given charset.
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is done
 * by mbstring, by iconv or by the internal implementations of this class.
 *
 * @param string $charset The character set
 * @param string $string Input string
 * @param int $start Start position (character position)
 * @param int|NULL $len Length (in characters); NULL means "up to the end of the string"
 * @return string|bool The substring; the delegated implementations may return FALSE if $start is outside the string
 */
public function substr($charset, $string, $start, $len = NULL) {
	if ($len === 0 || $string === '') {
		return '';
	}
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($utils == 'mbstring') {
		// Cannot omit $len, when specifying charset
		if ($len == NULL) {
			// Save internal encoding
			$enc = mb_internal_encoding();
			mb_internal_encoding($charset);
			$str = mb_substr($string, $start);
			// Restore internal encoding
			mb_internal_encoding($enc);
			return $str;
		}
		return mb_substr($string, $start, $len, $charset);
	}
	if ($utils == 'iconv') {
		// Cannot omit $len, when specifying charset
		if ($len == NULL) {
			// Save internal encoding
			$enc = iconv_get_encoding('internal_encoding');
			iconv_set_encoding('internal_encoding', $charset);
			$str = iconv_substr($string, $start);
			// Restore internal encoding
			iconv_set_encoding('internal_encoding', $enc);
			return $str;
		}
		return iconv_substr($string, $start, $len, $charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_substr($string, $start, $len);
	}
	if (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_substr($string, $start, $charset, $len);
	}
	// Fixed-width encodings: scale character positions to byte positions
	$bytesPerChar = 1;
	if (!empty($this->twoByteSets[$charset])) {
		$bytesPerChar = 2;
	} elseif (!empty($this->fourByteSets[$charset])) {
		$bytesPerChar = 4;
	}
	// An omitted length must not be multiplied: NULL * n is 0 and would yield an empty string
	if ($len == NULL) {
		return substr($string, $start * $bytesPerChar);
	}
	// Everything that is not a known multi-byte set is treated as single-byte encoding
	return substr($string, $start * $bytesPerChar, $len * $bytesPerChar);
}
1592 
/**
 * Counts the number of characters of a string in the given charset.
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is done
 * by mbstring, by iconv or by the internal implementations of this class.
 *
 * @param string $charset The character set
 * @param string $string Input string
 * @return int The number of characters
 */
public function strlen($charset, $string) {
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($utils == 'mbstring') {
		return mb_strlen($string, $charset);
	}
	if ($utils == 'iconv') {
		return iconv_strlen($string, $charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_strlen($string, $charset);
	}
	// Fixed-width encodings: an incomplete trailing character is not counted,
	// so the result is always an integer
	if (!empty($this->twoByteSets[$charset])) {
		return (int)(strlen($string) / 2);
	}
	if (!empty($this->fourByteSets[$charset])) {
		return (int)(strlen($string) / 4);
	}
	// Treat everything else as single-byte encoding
	return strlen($string);
}
1620 
/**
 * Crops a string to a number of characters using the mbstring functions.
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param int $len Crop length in characters; a negative value keeps the end of the string instead of the beginning
 * @param string $crop Marker attached where the string was cut (appended for positive $len, prepended for negative $len)
 * @return string The cropped string, or $string unchanged if $len is 0 or the string is not longer than abs($len) characters
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
	// A zero length means "do not crop at all"
	if ((int)$len === 0) {
		return $string;
	}
	$charCount = mb_strlen($string, $charset);
	// Nothing to cut off
	if ($charCount <= abs($len)) {
		return $string;
	}
	return $len > 0
		? mb_substr($string, 0, $len, $charset) . $crop
		: $crop . mb_substr($string, $len, $charCount, $charset);
}
1642 
/**
 * Crops a string to a number of characters of the given charset.
 *
 * With 'mbstring' configured in $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
 * the work is delegated to cropMbstring(); otherwise the character position is
 * translated into a byte position first.
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param int $len Crop length in characters; a negative value keeps the end of the string instead of the beginning
 * @param string $crop Marker attached where the string was cut (appended for positive $len, prepended for negative $len)
 * @return string The cropped string, or $string unchanged if nothing had to be cut off
 */
public function crop($charset, $string, $len, $crop = '') {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $this->cropMbstring($charset, $string, $len, $crop);
	}
	if ((int)$len === 0) {
		return $string;
	}
	// Translate the character position into a byte position
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string, $len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		$i = $this->euc_char2byte_pos($string, $len, $charset);
	} elseif ($len > 0) {
		$i = $len;
	} else {
		$i = strlen($string) + $len;
		if ($i <= 0) {
			$i = FALSE;
		}
	}
	// $len outside actual string length
	if ($i === FALSE) {
		return $string;
	}
	$i = (int)$i;
	$byteLength = strlen($string);
	// Explicit bounds checks instead of probing $string[$i]: reading a missing offset
	// raises a notice/warning and negative offsets address the end of the string
	if ($len > 0) {
		// Only crop if there is something left behind the cut position
		if ($i >= 0 && $i < $byteLength) {
			return substr($string, 0, $i) . $crop;
		}
	} elseif ($i >= 1 && $i <= $byteLength) {
		// Only crop if there is something in front of the cut position
		return $crop . substr($string, $i);
	}
	return $string;
}
1692 
/**
 * Truncates a string to at most $len bytes, taking the character boundaries of the
 * given charset into account.
 *
 * @param string $charset The character set
 * @param string $string Input string
 * @param int $len Maximum length in bytes
 * @return string The truncated string; an empty string if $len is not positive
 */
public function strtrunc($charset, $string, $len) {
	if ($len <= 0) {
		return '';
	}
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string, 0, $len, $charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string, $len);
	}
	if (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_strtrunc($string, $len, $charset);
	}
	if (!empty($this->twoByteSets[$charset])) {
		// Realign to a position divisible by two
		if ($len % 2) {
			$len--;
		}
	} elseif (!empty($this->fourByteSets[$charset])) {
		// Realign to a position divisible by four
		$len -= $len % 4;
	}
	// Treat everything else as single-byte encoding
	return substr($string, 0, $len);
}
1725 
/**
 * Converts the case of a string in the given charset.
 *
 * With 'mbstring' configured in $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
 * the mbstring functions are used: 'toLower' lower-cases, every other value of $case
 * upper-cases. Otherwise the character mapping methods of this class are used.
 *
 * @param string $charset The character set
 * @param string $string Input string
 * @param string $case Case keyword, e.g. 'toLower' or 'toUpper'; passed on as table name to the mapping methods
 * @return string The converted string
 */
public function conv_case($charset, $string, $case) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $case == 'toLower'
			? mb_strtolower($string, $charset)
			: mb_strtoupper($string, $charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'case', $case);
	}
	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'case', $case);
	}
	// Treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1758 
/**
 * Converts the case of the first character of a string and leaves the rest untouched.
 *
 * @param string $charset The character set
 * @param string $string Input string
 * @param string $case Case keyword as accepted by conv_case()
 * @return string The string with its first character converted
 */
public function convCaseFirst($charset, $string, $case) {
	$convertedHead = $this->conv_case($charset, $this->substr($charset, $string, 0, 1), $case);
	return $convertedHead . $this->substr($charset, $string, 1);
}
1774 
/**
 * Replaces characters of a string by their ASCII equivalents, using the 'ascii'
 * mode of the character mapping methods of this class.
 *
 * @param string $charset The character set
 * @param string $string Input string
 * @return string The converted string
 */
public function specCharsToASCII($charset, $string) {
	if ($charset === 'utf-8') {
		return $this->utf8_char_mapping($string, 'ascii');
	}
	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'ascii');
	}
	// Treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'ascii');
}
1794 
/**
 * Selects the TYPO3 language key that matches a comma separated list of language
 * codes with optional ";q=" quality values best.
 *
 * @param string $languageCodesList Comma separated list of language codes, e.g. "de-DE,de;q=0.8,en;q=0.5"
 * @return string The TYPO3 language key of the first match in order of descending quality; 'default' if nothing matches or if the match is 'en'
 */
public function getPreferredClientLanguage($languageCodesList) {
	$allLanguageCodes = array();
	$selectedLanguage = 'default';
	// Get all languages where TYPO3 code is the same as the ISO code
	foreach ($this->charSetArray as $typo3Lang => $charSet) {
		$allLanguageCodes[$typo3Lang] = $typo3Lang;
	}
	// Get all languages where TYPO3 code differs from ISO code
	// or needs the country part
	// the iso codes will here overwrite the default typo3 language in the key
	foreach ($this->locales->getIsoMapping() as $typo3Lang => $isoLang) {
		$allLanguageCodes[$typo3Lang] = str_replace('_', '-', $isoLang);
	}
	// Move the iso codes to the keys (because we're comparing the keys with "isset" later on)
	$allLanguageCodes = array_flip($allLanguageCodes);
	$preferredLanguages = GeneralUtility::trimExplode(',', $languageCodesList);
	// Collect the preferred languages together with their quality
	$sortedPreferredLanguages = array();
	foreach ($preferredLanguages as $preferredLanguage) {
		$quality = 1.0;
		if (strpos($preferredLanguage, ';q=') !== FALSE) {
			list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage, 2);
		}
		$sortedPreferredLanguages[$preferredLanguage] = $quality;
	}
	// Loop through the languages, with the highest priority first
	arsort($sortedPreferredLanguages, SORT_NUMERIC);
	foreach ($sortedPreferredLanguages as $preferredLanguage => $quality) {
		if (isset($allLanguageCodes[$preferredLanguage])) {
			$selectedLanguage = $allLanguageCodes[$preferredLanguage];
			break;
		}
		// Strip the country code from the end; codes without a country part
		// simply yield themselves again
		$languageParts = explode('-', $preferredLanguage, 2);
		$languageOnly = $languageParts[0];
		if (isset($allLanguageCodes[$languageOnly])) {
			$selectedLanguage = $allLanguageCodes[$languageOnly];
			break;
		}
	}
	if (!$selectedLanguage || $selectedLanguage == 'en') {
		$selectedLanguage = 'default';
	}
	return $selectedLanguage;
}
1847 
1848  /********************************************
1849  *
1850  * Internal string operation functions
1851  *
1852  ********************************************/
/**
 * Maps all characters of a string in a single-byte charset.
 *
 * @param string $str Input string
 * @param string $charset The character set
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt Table name within the case folding data (used as array key in 'case' mode)
 * @return string The converted string; $str unchanged if the mode is unknown or the mapping table could not be initialized
 */
public function sb_char_mapping($str, $charset, $mode, $opt = '') {
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				// Do nothing
				return $str;
			}
			$map = &$this->caseFolding[$charset][$opt];
			break;
		case 'ascii':
			if (!$this->initToASCII($charset)) {
				// Do nothing
				return $str;
			}
			$map = &$this->toASCII[$charset];
			break;
		default:
			return $str;
	}
	$str = (string)$str;
	$out = '';
	// Iterate up to the known byte length instead of reading offsets until one is missing
	$byteLength = strlen($str);
	for ($i = 0; $i < $byteLength; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}
	return $out;
}
1893 
1894  /********************************************
1895  *
1896  * Internal UTF-8 string operation functions
1897  *
1898  ********************************************/
/**
 * Returns a part of a UTF-8 encoded string, counted in characters.
 *
 * @param string $str UTF-8 string
 * @param int $start Start position (character position); negative values count from the end
 * @param int|NULL $len Length (in characters); negative values count from the end, NULL means "up to the end of the string"
 * @return string|bool The substring; FALSE if a positive $start is outside the string
 */
public function utf8_substr($str, $start, $len = NULL) {
	if ((string)$len === '0') {
		return '';
	}
	$byte_start = $this->utf8_char2byte_pos($str, $start);
	if ($byte_start === FALSE) {
		if ($start > 0) {
			// $start outside string length
			return FALSE;
		}
		// A negative start reaching before the beginning starts at the first byte
		$byte_start = 0;
	}
	$str = substr($str, $byte_start);
	if ($len == NULL) {
		return $str;
	}
	$byte_end = $this->utf8_char2byte_pos($str, $len);
	if ($byte_end === FALSE) {
		// $len outside actual string length: a negative length that exceeds the
		// string leaves nothing, a positive one keeps everything
		return $len < 0 ? '' : $str;
	}
	return substr($str, 0, $byte_end);
}
1937 
/**
 * Counts the characters of a UTF-8 encoded string.
 *
 * Every byte that is not a continuation byte (10xxxxxx) starts a character: these
 * are the single-byte characters (0xxxxxxx) and the multi-byte start bytes (11xxxxxx).
 *
 * @param string $str UTF-8 string
 * @return int The number of characters
 */
public function utf8_strlen($str) {
	$str = (string)$str;
	$n = 0;
	// Iterate up to the known byte length instead of reading offsets until one is missing
	$byteLength = strlen($str);
	for ($i = 0; $i < $byteLength; $i++) {
		if ((ord($str[$i]) & 192) != 128) {
			$n++;
		}
	}
	return $n;
}
1961 
/**
 * Truncates a UTF-8 encoded string to at most $len bytes without cutting a
 * multi-byte character apart.
 *
 * @param string $str UTF-8 string
 * @param int $len Maximum length in bytes
 * @return string The truncated string; an empty string if $len is not positive or not even the first character fits
 */
public function utf8_strtrunc($str, $len) {
	$str = (string)$str;
	$len = (int)$len;
	if ($len <= 0) {
		return '';
	}
	if ($len > strlen($str)) {
		return $str;
	}
	$i = $len - 1;
	// Last kept byte is part of a multi-byte sequence
	if (ord($str[$i]) & 128) {
		// Walk back over continuation bytes (10xxxxxx) to the start byte of the
		// sequence; byte 0 is inspected as well, it may be that start byte
		while ($i >= 0 && (ord($str[$i]) & 192) == 128) {
			$i--;
		}
		if ($i < 0) {
			// Only continuation bytes: nothing valid to keep
			return '';
		}
		// The number of leading 1-bits of the start byte is the length of the sequence
		for ($bc = 0, $mbs = ord($str[$i]); $mbs & 128; $mbs = $mbs << 1) {
			$bc++;
		}
		// The sequence does not fit completely: cut in front of it
		if ($bc + $i > $len) {
			return substr($str, 0, $i);
		}
	}
	return substr($str, 0, $len);
}
1992 
/**
 * Finds the character position of the first occurrence of a string in a UTF-8 string.
 *
 * Uses mbstring or iconv if configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'], otherwise the byte based
 * strpos() together with the position translation methods of this class.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param int $offset Character position to start the search at
 * @return int|bool The character position; FALSE if the needle was not found or the offset is beyond the string
 */
public function utf8_strpos($haystack, $needle, $offset = 0) {
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($utils == 'mbstring') {
		return mb_strpos($haystack, $needle, $offset, 'utf-8');
	}
	if ($utils == 'iconv') {
		return iconv_strpos($haystack, $needle, $offset, 'utf-8');
	}
	$byteOffset = $this->utf8_char2byte_pos($haystack, $offset);
	// Offset beyond string length
	if ($byteOffset === FALSE) {
		return FALSE;
	}
	$bytePosition = strpos($haystack, $needle, $byteOffset);
	// Needle not found
	return $bytePosition === FALSE
		? FALSE
		: $this->utf8_byte2char_pos($haystack, $bytePosition);
}
2021 
/**
 * Finds the character position of the last occurrence of a string in a UTF-8 string.
 *
 * Uses mbstring or iconv if configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'], otherwise the byte based
 * strrpos() together with utf8_byte2char_pos().
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @return int|bool The character position; FALSE if the needle was not found
 */
public function utf8_strrpos($haystack, $needle) {
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($utils == 'mbstring') {
		// The third argument of mb_strrpos() is the offset; the encoding comes fourth
		return mb_strrpos($haystack, $needle, 0, 'utf-8');
	}
	if ($utils == 'iconv') {
		return iconv_strrpos($haystack, $needle, 'utf-8');
	}
	$byte_pos = strrpos($haystack, $needle);
	if ($byte_pos === FALSE) {
		// Needle not found
		return FALSE;
	}
	return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
2044 
/**
 * Translates a character position into a byte position within a UTF-8 string.
 *
 * For a negative $pos the string is walked backwards from its last byte.
 *
 * @param string $str UTF-8 string
 * @param int $pos Character position; negative values count from the end
 * @return int|bool The byte position; FALSE if the walk leaves the string before a valid position is reached
 */
public function utf8_char2byte_pos($str, $pos) {
	$str = (string)$str;
	$byteLength = strlen($str);
	// Number of characters found
	$n = 0;
	// Number of characters wanted
	$p = abs($pos);
	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $byteLength - 1;
		$d = -1;
	}
	// Explicit bounds instead of probing $str[$i]: a missing offset raises a
	// notice/warning and negative offsets address the end of the string
	for (; $i >= 0 && $i < $byteLength && $n < $p; $i += $d) {
		// Single-byte characters (0xxxxxxx) and multi-byte start bytes (11xxxxxx)
		// count as characters, continuation bytes (10xxxxxx) do not
		if ((ord($str[$i]) & 192) != 128) {
			$n++;
		}
	}
	if ($i < 0 || $i >= $byteLength) {
		// Offset beyond string length
		return FALSE;
	}
	if ($pos >= 0) {
		// Skip trailing multi-byte data bytes
		while ($i < $byteLength && (ord($str[$i]) & 192) == 128) {
			$i++;
		}
	} else {
		// Correct offset
		$i++;
	}
	return $i;
}
2091 
/**
 * Translates a byte position into a character position within a UTF-8 string.
 *
 * The result is the number of characters that start in the bytes 1..$pos, which
 * equals the character index of the character located at byte $pos.
 *
 * @param string $str UTF-8 string
 * @param int $pos Byte position
 * @return int|bool The character position; FALSE if the string is empty or $pos is negative or beyond the end of the string
 */
public function utf8_byte2char_pos($str, $pos) {
	$str = (string)$str;
	$pos = (int)$pos;
	$byteLength = strlen($str);
	// Validate the requested position itself
	if ($byteLength === 0 || $pos < 0 || $pos > $byteLength) {
		// Offset beyond string length
		return FALSE;
	}
	// Number of characters
	$n = 0;
	for ($i = $pos; $i > 0; $i--) {
		// The position directly behind the last byte counts as a character boundary;
		// otherwise single-byte characters (0xxxxxxx) and multi-byte start bytes
		// (11xxxxxx) start a character, continuation bytes (10xxxxxx) do not
		if ($i === $byteLength || (ord($str[$i]) & 192) != 128) {
			$n++;
		}
	}
	return $n;
}
2120 
/**
 * Maps all characters of a UTF-8 string.
 *
 * @param string $str UTF-8 string
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt Table name within the case folding data (used as array key in 'case' mode)
 * @return string The converted string; $str unchanged if the mode is unknown or the Unicode data could not be initialized
 */
public function utf8_char_mapping($str, $mode, $opt = '') {
	if (!$this->initUnicodeData($mode)) {
		// Do nothing
		return $str;
	}
	switch ($mode) {
		case 'case':
			$map = &$this->caseFolding['utf-8'][$opt];
			break;
		case 'ascii':
			$map = &$this->toASCII['utf-8'];
			break;
		default:
			return $str;
	}
	$str = (string)$str;
	$out = '';
	// Iterate up to the known byte length instead of reading offsets until one is missing
	$byteLength = strlen($str);
	for ($i = 0; $i < $byteLength; $i++) {
		$c = ord($str[$i]);
		if (($c & 192) == 192) {
			// Multi-byte starting byte (11xxxxxx): the number of leading 1-bits
			// is the number of bytes of the sequence
			for ($bc = 0; $c & 128; $c = $c << 1) {
				$bc++;
			}
			$mbc = substr($str, $i, $bc);
			$i += $bc - 1;
		} else {
			// Single-byte character (0xxxxxxx), or a stray continuation byte which
			// is passed through as it is instead of repeating the previous character
			$mbc = $str[$i];
		}
		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}
	return $out;
}
2168 
2169  /********************************************
2170  *
2171  * Internal EUC string operation functions
2172  *
2173  * Extended Unix Code:
2174  * ASCII compatible 7bit single bytes chars
2175  * 8bit two byte chars
2176  *
2177  * Shift-JIS is treated as a special case.
2178  *
2179  ********************************************/
/**
 * Truncates a string in an EUC based charset to at most $len bytes without cutting
 * a double-byte character apart.
 *
 * @param string $str EUC multibyte character string
 * @param int $len Maximum length in bytes
 * @param string $charset The charset; 'shift_jis' uses its own lead byte ranges
 * @return string The truncated string
 */
public function euc_strtrunc($str, $len, $charset) {
	$str = (string)$str;
	$byteLength = strlen($str);
	// String not longer than the supplied length
	if ($len >= $byteLength) {
		return $str;
	}
	if ($len <= 0) {
		return '';
	}
	$sjis = $charset == 'shift_jis';
	// Walk character by character until the limit is reached or passed
	$i = 0;
	while ($i < $len) {
		$c = ord($str[$i]);
		$isLeadByte = $sjis ? ($c >= 128 && $c < 160 || $c >= 224) : $c >= 128;
		$i += $isLeadByte ? 2 : 1;
	}
	if ($i > $len) {
		// The last character started on the final allowed byte and does not fit
		return substr($str, 0, $len - 1);
	}
	return substr($str, 0, $len);
}
2215 
/**
 * Returns a part of a string in an EUC based charset, counted in characters.
 *
 * Handles the edge cases in the same way as utf8_substr().
 *
 * @param string $str EUC multibyte character string
 * @param int $start Start position (character position); negative values count from the end
 * @param string $charset The charset
 * @param int|NULL $len Length (in characters); negative values count from the end, NULL means "up to the end of the string"
 * @return string|bool The substring; FALSE if a positive $start is outside the string
 */
public function euc_substr($str, $start, $charset, $len = NULL) {
	// An explicit length of zero selects nothing
	if ((string)$len === '0') {
		return '';
	}
	$byte_start = $this->euc_char2byte_pos($str, $start, $charset);
	if ($byte_start === FALSE) {
		if ($start > 0) {
			// $start outside string length
			return FALSE;
		}
		// A negative start reaching before the beginning starts at the first byte
		$byte_start = 0;
	}
	$str = substr($str, $byte_start);
	if ($len == NULL) {
		return $str;
	}
	$byte_end = $this->euc_char2byte_pos($str, $len, $charset);
	if ($byte_end === FALSE) {
		// $len outside actual string length: a negative length that exceeds the
		// string leaves nothing, a positive one keeps everything
		return $len < 0 ? '' : $str;
	}
	return substr($str, 0, $byte_end);
}
2245 
/**
 * Counts the characters of a string in an EUC based charset.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset; 'shift_jis' uses its own lead byte ranges
 * @return int The number of characters
 */
public function euc_strlen($str, $charset) {
	$str = (string)$str;
	$sjis = $charset == 'shift_jis';
	$n = 0;
	// Iterate up to the known byte length instead of reading offsets until one is missing
	$byteLength = strlen($str);
	for ($i = 0; $i < $byteLength; $i++) {
		$c = ord($str[$i]);
		// A lead byte announces a double-byte character: skip its second byte
		if ($sjis ? ($c >= 128 && $c < 160 || $c >= 224) : $c >= 128) {
			$i++;
		}
		$n++;
	}
	return $n;
}
2273 
/**
 * Translates a character position into a byte position within a string in an
 * EUC based charset.
 *
 * For a negative $pos the string is walked backwards from its last byte.
 *
 * @param string $str EUC multibyte character string
 * @param int $pos Character position; negative values count from the end
 * @param string $charset The charset; 'shift_jis' uses its own lead byte ranges
 * @return int|bool The byte position; FALSE if the walk leaves the string before a valid position is reached
 */
public function euc_char2byte_pos($str, $pos, $charset) {
	$str = (string)$str;
	$byteLength = strlen($str);
	$sjis = $charset == 'shift_jis';
	// Number of characters seen
	$n = 0;
	// Number of characters wanted
	$p = abs($pos);
	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $byteLength - 1;
		$d = -1;
	}
	// Explicit bounds instead of probing $str[$i]: a missing offset raises a
	// notice/warning and negative offsets address the end of the string
	for (; $i >= 0 && $i < $byteLength && $n < $p; $i += $d) {
		$c = ord($str[$i]);
		// A byte in the double-byte range takes the neighbouring byte along
		if ($sjis ? ($c >= 128 && $c < 160 || $c >= 224) : $c >= 128) {
			$i += $d;
		}
		$n++;
	}
	if ($i < 0 || $i >= $byteLength) {
		// Offset beyond string length
		return FALSE;
	}
	if ($pos < 0) {
		// Correct offset
		$i++;
	}
	return $i;
}
2319 
/**
 * Maps all characters of a string in an EUC based charset.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset; 'shift_jis' uses its own lead byte ranges
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt Table name within the case folding data (used as array key in 'case' mode)
 * @return string The converted string; $str unchanged if the mode is unknown or the mapping table could not be initialized
 */
public function euc_char_mapping($str, $charset, $mode, $opt = '') {
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				// Do nothing
				return $str;
			}
			$map = &$this->caseFolding[$charset][$opt];
			break;
		case 'ascii':
			if (!$this->initToASCII($charset)) {
				// Do nothing
				return $str;
			}
			$map = &$this->toASCII[$charset];
			break;
		default:
			return $str;
	}
	$str = (string)$str;
	$sjis = $charset == 'shift_jis';
	$out = '';
	// Iterate up to the known byte length instead of reading offsets until one is missing
	$byteLength = strlen($str);
	for ($i = 0; $i < $byteLength; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);
		// A double-byte char: take the following byte along
		if ($sjis ? ($c >= 128 && $c < 160 || $c >= 224) : $c >= 128) {
			$mbc = substr($str, $i, 2);
			$i++;
		}
		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}
	return $out;
}
2375 
2376 }
utf8_strpos($haystack, $needle, $offset=0)
entities_to_utf8($str, $alsoStdHtmlEnt=FALSE)
euc_char_mapping($str, $charset, $mode, $opt='')
convArray(&$array, $fromCS, $toCS, $useEntityForNoChar=0)
static writeFileToTypo3tempDir($filepath, $content)
static trimExplode($delim, $string, $removeEmptyValues=FALSE, $limit=0)
crop($charset, $string, $len, $crop='')
utf8_decode($str, $charset, $useEntityForNoChar=0)
euc_substr($str, $start, $charset, $len=NULL)
static getUrl($url, $includeHeader=0, $requestHeaders=FALSE, &$report=NULL)
sb_char_mapping($str, $charset, $mode, $opt='')
conv($str, $fromCS, $toCS, $useEntityForNoChar=0)
substr($charset, $string, $start, $len=NULL)
if(!defined('TYPO3_MODE')) $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_userauth.php']['logoff_pre_processing'][]
utf8_to_numberarray($str, $convEntities=0, $retChar=0)
static getFileAbsFileName($filename, $onlyRelative=TRUE, $relToTYPO3_mainDir=FALSE)
cropMbstring($charset, $string, $len, $crop='')