62 'noCharByteVal' =>
'Using $noCharByteVal of class CharsetConverter from the outside is discouraged, as this only reflects a fixed constant.',
63 'parsedCharsets' =>
'Using $parsedCharsets of class CharsetConverter from the outside is discouraged, as this only reflects a local runtime cache.',
64 'toASCII' =>
'Using $toASCII of class CharsetConverter from the outside is discouraged, as this only reflects a local runtime cache.',
65 'twoByteSets' =>
'Using $twoByteSets of class CharsetConverter from the outside is discouraged.',
66 'eucBasedSets' =>
'Using $eucBasedSets of class CharsetConverter from the outside is discouraged.',
67 'synonyms' =>
'Using $synonyms of class CharsetConverter from the outside is discouraged, as this functionality will be removed in TYPO3 v10.0.',
121 'us-ascii' =>
'ascii',
122 'cp819' =>
'iso-8859-1',
123 'ibm819' =>
'iso-8859-1',
124 'iso-ir-100' =>
'iso-8859-1',
125 'iso-ir-101' =>
'iso-8859-2',
126 'iso-ir-109' =>
'iso-8859-3',
127 'iso-ir-110' =>
'iso-8859-4',
128 'iso-ir-144' =>
'iso-8859-5',
129 'iso-ir-127' =>
'iso-8859-6',
130 'iso-ir-126' =>
'iso-8859-7',
131 'iso-ir-138' =>
'iso-8859-8',
132 'iso-ir-148' =>
'iso-8859-9',
133 'iso-ir-157' =>
'iso-8859-10',
134 'iso-ir-179' =>
'iso-8859-13',
135 'iso-ir-199' =>
'iso-8859-14',
136 'iso-ir-203' =>
'iso-8859-15',
137 'csisolatin1' =>
'iso-8859-1',
138 'csisolatin2' =>
'iso-8859-2',
139 'csisolatin3' =>
'iso-8859-3',
140 'csisolatin5' =>
'iso-8859-9',
141 'csisolatin8' =>
'iso-8859-14',
142 'csisolatin9' =>
'iso-8859-15',
143 'csisolatingreek' =>
'iso-8859-7',
144 'iso-celtic' =>
'iso-8859-14',
145 'latin1' =>
'iso-8859-1',
146 'latin2' =>
'iso-8859-2',
147 'latin3' =>
'iso-8859-3',
148 'latin5' =>
'iso-8859-9',
149 'latin6' =>
'iso-8859-10',
150 'latin8' =>
'iso-8859-14',
151 'latin9' =>
'iso-8859-15',
152 'l1' =>
'iso-8859-1',
153 'l2' =>
'iso-8859-2',
154 'l3' =>
'iso-8859-3',
155 'l5' =>
'iso-8859-9',
156 'l6' =>
'iso-8859-10',
157 'l8' =>
'iso-8859-14',
158 'l9' =>
'iso-8859-15',
159 'cyrillic' =>
'iso-8859-5',
160 'arabic' =>
'iso-8859-6',
161 'tis-620' =>
'iso-8859-11',
162 'win874' =>
'windows-874',
163 'win1250' =>
'windows-1250',
164 'win1251' =>
'windows-1251',
165 'win1252' =>
'windows-1252',
166 'win1253' =>
'windows-1253',
167 'win1254' =>
'windows-1254',
168 'win1255' =>
'windows-1255',
169 'win1256' =>
'windows-1256',
170 'win1257' =>
'windows-1257',
171 'win1258' =>
'windows-1258',
172 'cp1250' =>
'windows-1250',
173 'cp1251' =>
'windows-1251',
174 'cp1252' =>
'windows-1252',
175 'ms-ee' =>
'windows-1250',
176 'ms-ansi' =>
'windows-1252',
177 'ms-greek' =>
'windows-1253',
178 'ms-turk' =>
'windows-1254',
179 'winbaltrim' =>
'windows-1257',
180 'koi-8ru' =>
'koi-8r',
184 'macintosh' =>
'macroman',
185 'euc-cn' =>
'gb2312',
186 'x-euc-cn' =>
'gb2312',
192 'sjis' =>
'shift_jis',
193 'shift-jis' =>
'shift_jis',
194 'cp932' =>
'shift_jis',
213 trigger_error(
'Method CharsetConverter->parse_charset() will be removed in TYPO3 v10.0. Use native mbstring functions directly.', E_USER_DEPRECATED);
214 $charset = trim(strtolower($charset));
215 if (isset($this->synonyms[$charset])) {
216 $charset = $this->synonyms[$charset];
236 public function conv($inputString, $fromCharset, $toCharset, $useEntityForNoChar =
null)
238 if ($fromCharset === $toCharset) {
241 if ($useEntityForNoChar ===
null) {
242 $useEntityForNoChar =
false;
244 if (!$useEntityForNoChar) {
245 trigger_error(
'Calling CharsetConverter->conv() without the necessity to convert the entities for unavailable characters is discouraged, and will not be possible via conv() anymore in TYPO3 v10.0. Use native mb_convert_encoding() directly, or set the 4th parameter of conv() to true.', E_USER_DEPRECATED);
249 if ($toCharset ===
'utf-8' || !$useEntityForNoChar) {
251 $convertedString = mb_convert_encoding($inputString, $toCharset, $fromCharset);
252 if (
false !== $convertedString) {
253 return $convertedString;
256 if ($fromCharset !==
'utf-8') {
257 $inputString = $this->
utf8_encode($inputString, $fromCharset);
259 if ($toCharset !==
'utf-8') {
260 $inputString = $this->
utf8_decode($inputString, $toCharset, $useEntityForNoChar);
276 public function convArray(&$array, $fromCharset, $toCharset, $useEntityForNoChar =
false)
278 trigger_error(
'Method CharsetConverter->convArray() will be removed in TYPO3 v10.0. Use conv() directly and loop over the array in the callers code.', E_USER_DEPRECATED);
279 foreach ($array as $key => $value) {
280 if (is_array($array[$key])) {
281 $this->
convArray($array[$key], $fromCharset, $toCharset, $useEntityForNoChar);
282 } elseif (is_string($array[$key])) {
283 $array[$key] = $this->
conv($array[$key], $fromCharset, $toCharset, $useEntityForNoChar);
297 if ($charset ===
'utf-8') {
303 $strLen = strlen($str);
306 for ($a = 0; $a < $strLen; $a++) {
307 $chr = substr($str, $a, 1);
310 if (isset($this->twoByteSets[$charset])) {
311 $ord2 = ord($str[$a + 1]);
313 $ord = $ord << 8 | $ord2;
315 if (isset($this->parsedCharsets[$charset][
'local'][$ord])) {
316 $outStr .= $this->parsedCharsets[$charset][
'local'][$ord];
318 $outStr .= chr($this->noCharByteVal);
322 } elseif ($ord > 127) {
325 if (isset($this->eucBasedSets[$charset])) {
327 if ($charset !==
'shift_jis' || ($ord < 160 || $ord > 223)) {
329 $ord2 = ord(substr($str, $a, 1));
330 $ord = $ord * 256 + $ord2;
333 if (isset($this->parsedCharsets[$charset][
'local'][$ord])) {
335 $outStr .= $this->parsedCharsets[$charset][
'local'][$ord];
337 $outStr .= chr($this->noCharByteVal);
356 public function utf8_decode($str, $charset, $useEntityForNoChar =
false)
358 if ($charset ===
'utf-8') {
364 $strLen = strlen($str);
367 for ($a = 0, $i = 0; $a < $strLen; $a++, $i++) {
368 $chr = substr($str, $a, 1);
377 for ($b = 0; $b < 8; $b++) {
384 $buf .= substr($str, $a, 1);
390 if (isset($this->parsedCharsets[$charset][
'utf8'][$buf])) {
392 $mByte = $this->parsedCharsets[$charset][
'utf8'][$buf];
395 $outStr .= chr($mByte >> 8 & 255) . chr($mByte & 255);
397 $outStr .= chr($mByte);
399 } elseif ($useEntityForNoChar) {
403 $outStr .= chr($this->noCharByteVal);
406 $outStr .= chr($this->noCharByteVal);
426 trigger_error(
'Method CharsetConverter->utf8_to_entities() will be removed in TYPO3 v10.0. Use native PHP functions instead.', E_USER_DEPRECATED);
427 $strLen = strlen($str);
430 for ($a = 0; $a < $strLen; $a++) {
431 $chr = substr($str, $a, 1);
440 for ($b = 0; $b < 8; $b++) {
447 $buf .= substr($str, $a, 1);
454 $outStr .= chr($this->noCharByteVal);
472 trigger_error(
'Method CharsetConverter->entities_to_utf8() will be removed in TYPO3 v10.0. Use native PHP function html_entity_decode() instead.', E_USER_DEPRECATED);
473 $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT));
474 $token = md5(microtime());
475 $parts = explode($token, preg_replace(
'/(&([#[:alnum:]]*);)/', $token .
'${2}' . $token, $str));
476 foreach ($parts as $k => $v) {
483 if (substr($v, $position, 1) ===
'#') {
485 if (substr($v, $position, 1) ===
'x') {
486 $v = hexdec(substr($v, ++$position));
488 $v = substr($v, $position);
491 } elseif (isset($trans_tbl[
'&' . $v .
';'])) {
493 $v = $trans_tbl[
'&' . $v .
';'];
497 $parts[$k] =
'&' . $v .
';';
500 return implode(
'', $parts);
514 $str = html_entity_decode($str, ENT_COMPAT,
'utf-8');
517 $strLen = strlen($str);
520 for ($a = 0; $a < $strLen; $a++) {
521 $chr = substr($str, $a, 1);
530 for ($b = 0; $b < 8; $b++) {
537 $buf .= substr($str, $a, 1);
544 $outArr[] = chr($this->noCharByteVal);
547 $outArr[] = chr($ord);
576 if ($unicodeInteger < 128) {
577 $str .= chr($unicodeInteger);
578 } elseif ($unicodeInteger < 2048) {
579 $str .= chr(192 | $unicodeInteger >> 6);
580 $str .= chr(128 | $unicodeInteger & 63);
581 } elseif ($unicodeInteger < 65536) {
582 $str .= chr(224 | $unicodeInteger >> 12);
583 $str .= chr(128 | $unicodeInteger >> 6 & 63);
584 $str .= chr(128 | $unicodeInteger & 63);
585 } elseif ($unicodeInteger < 2097152) {
586 $str .= chr(240 | $unicodeInteger >> 18);
587 $str .= chr(128 | $unicodeInteger >> 12 & 63);
588 $str .= chr(128 | $unicodeInteger >> 6 & 63);
589 $str .= chr(128 | $unicodeInteger & 63);
590 } elseif ($unicodeInteger < 67108864) {
591 $str .= chr(248 | $unicodeInteger >> 24);
592 $str .= chr(128 | $unicodeInteger >> 18 & 63);
593 $str .= chr(128 | $unicodeInteger >> 12 & 63);
594 $str .= chr(128 | $unicodeInteger >> 6 & 63);
595 $str .= chr(128 | $unicodeInteger & 63);
596 } elseif ($unicodeInteger < 2147483648) {
597 $str .= chr(252 | $unicodeInteger >> 30);
598 $str .= chr(128 | $unicodeInteger >> 24 & 63);
599 $str .= chr(128 | $unicodeInteger >> 18 & 63);
600 $str .= chr(128 | $unicodeInteger >> 12 & 63);
601 $str .= chr(128 | $unicodeInteger >> 6 & 63);
602 $str .= chr(128 | $unicodeInteger & 63);
605 $str .= chr($this->noCharByteVal);
624 if (($ord & 192) === 192) {
628 for (; $b < 8; $b++) {
633 $binBuf .= substr(
'00000000' . decbin(ord(substr($str, $b + 1, 1))), -6);
638 $binBuf = substr(
'00000000' . decbin(ord($str[0])), -(6 - $b)) . $binBuf;
639 $int = bindec($binBuf);
643 return $hex ?
'x' . dechex($int) : $int;
664 if (empty($this->parsedCharsets[$charset])) {
668 if ($charset && GeneralUtility::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile)) {
672 if ($cacheFile && @is_file($cacheFile)) {
673 $this->parsedCharsets[$charset] = unserialize(file_get_contents($cacheFile), [
'allowed_classes' =>
false]);
676 $lines = GeneralUtility::trimExplode(LF, file_get_contents($charsetConvTableFile),
true);
678 $this->parsedCharsets[$charset] = [
'local' => [],
'utf8' => []];
681 foreach ($lines as $value) {
683 if (trim($value) && $value[0] !==
'#') {
686 if (!$detectedType) {
687 $detectedType = preg_match(
'/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/', $value) ?
'whitespaced' :
'ms-token';
691 if ($detectedType ===
'ms-token') {
692 list($hexbyte, $utf8) = preg_split(
'/[=:]/', $value, 3);
693 } elseif ($detectedType ===
'whitespaced') {
695 preg_match(
'/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/', $value, $regA);
697 $utf8 =
'U+' . $regA[2];
699 $decval = hexdec(trim($hexbyte));
701 $utf8decval = hexdec(substr(trim($utf8), 2));
702 $this->parsedCharsets[$charset][
'local'][$decval] = $this->
UnumberToChar($utf8decval);
703 $this->parsedCharsets[$charset][
'utf8'][$this->parsedCharsets[$charset][
'local'][$decval]] = $decval;
708 GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
713 throw new UnknownCharsetException(sprintf(
'Unknown charset "%s"', $charset), 1508916031);
730 if (isset($this->toASCII[
'utf-8']) && is_array($this->toASCII[
'utf-8'])) {
734 if ($cacheFileASCII && @is_file($cacheFileASCII)) {
735 $this->toASCII[
'utf-8'] = unserialize(file_get_contents($cacheFileASCII), [
'allowed_classes' =>
false]);
740 if (!(GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
743 $fh = fopen($unicodeDataFile,
'rb');
756 $line = fgets($fh, 4096);
758 list($char, $name, $cat, , , $decomp, , , $num) = explode(
';', rtrim($line));
759 $ord = hexdec($char);
767 $mark[
'U+' . $char] = 1;
771 if ($ord > 128 && $num !==
'') {
772 $number[
'U+' . $char] = $num;
777 if (preg_match(
'/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
779 if ($match[1] ===
'SMALL') {
782 $decomposition[
'U+' . $char] = [dechex($c)];
786 if (preg_match(
'/(<.*>)? *(.+)/', $decomp, $match)) {
790 $match[2] =
'0028 ' . $match[2] .
' 0029';
794 $match[2] =
'005B ' . $match[2] .
' 005D';
798 if (preg_match(
'/^0020 /', $match[2])) {
809 $decomposition[
'U+' . $char] = explode(
' ', $match[2]);
815 if (GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
816 $fh = fopen($customTranslitFile,
'rb');
819 $line = fgets($fh, 4096);
820 if ($line ===
false) {
823 if ($line[0] !==
'#' && trim($line) !==
'') {
824 list($char, $translit) = GeneralUtility::trimExplode(
';', $line);
826 $omit[
'U+' . $char] = 1;
828 $decomposition[
'U+' . $char] = explode(
' ', $translit);
835 foreach ($decomposition as $from => $to) {
837 while ($code_value = array_shift($to)) {
839 if (isset($decomposition[
'U+' . $code_value])) {
840 foreach (array_reverse($decomposition[
'U+' . $code_value]) as $cv) {
841 array_unshift($to, $cv);
843 } elseif (!isset($mark[
'U+' . $code_value])) {
845 $code_decomp[] = $code_value;
848 if (!empty($code_decomp) || isset($omit[$from])) {
849 $decomposition[$from] = $code_decomp;
851 unset($decomposition[$from]);
855 $this->toASCII[
'utf-8'] = [];
856 foreach ($decomposition as $from => $to) {
858 while ($code_value = array_shift($to)) {
859 $ord = hexdec($code_value);
864 $code_decomp[] = chr($ord);
866 $this->toASCII[
'utf-8'][$this->
UnumberToChar(hexdec(substr($from, 2)))] = implode(
'', $code_decomp);
869 foreach ($number as $from => $to) {
871 if (!isset($this->toASCII[
'utf-8'][$utf8_char])) {
872 $this->toASCII[
'utf-8'][$utf8_char] = $to;
875 if ($cacheFileASCII) {
876 GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($this->toASCII[
'utf-8']));
891 if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
896 if ($cacheFile && @is_file($cacheFile)) {
897 $this->toASCII[$charset] = unserialize(file_get_contents($cacheFile), [
'allowed_classes' =>
false]);
908 foreach ($this->parsedCharsets[$charset][
'local'] as $ci => $utf8) {
911 if (isset($this->toASCII[
'utf-8'][$utf8])) {
912 $this->toASCII[$charset][$c] = $this->toASCII[
'utf-8'][$utf8];
916 GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
939 public function crop($charset, $string, $len, $crop =
'')
941 trigger_error(
'Method CharsetConverter->crop() will be removed in TYPO3 v10.0. Use native PHP mbstring functions instead.', E_USER_DEPRECATED);
942 if ((
int)$len === 0 || mb_strlen($string, $charset) <= abs($len)) {
946 $string = mb_substr($string, 0, $len, $charset) . $crop;
948 $string = $crop . mb_substr($string, $len, mb_strlen($string, $charset), $charset);
964 trigger_error(
'Method CharsetConverter->convCaseFirst() will be removed in TYPO3 v10.0. Use native PHP mbstring functions instead.', E_USER_DEPRECATED);
965 $firstChar = mb_substr($string, 0, 1, $charset);
966 $firstChar = $case ===
'toLower'
967 ? mb_strtolower($firstChar, $charset)
968 : mb_strtoupper($firstChar, $charset);
969 $remainder = mb_substr($string, 1,
null, $charset);
970 return $firstChar . $remainder;
982 if ($charset ===
'utf-8') {
984 } elseif (isset($this->eucBasedSets[$charset])) {
1011 $map = &$this->toASCII[$charset];
1013 for ($i = 0; isset($str[$i]); $i++) {
1015 if (isset($map[$c])) {
1041 trigger_error(
'Method CharsetConverter->utf8_char2byte_pos() will be removed in TYPO3 v10.0.', E_USER_DEPRECATED);
1050 $i = strlen($str) - 1;
1053 for (; isset($str[$i]) && $n < $p; $i += $d) {
1054 $c = (int)ord($str[$i]);
1058 } elseif (($c & 192) === 192) {
1063 if (!isset($str[$i])) {
1069 while (ord($str[$i]) & 128 && !(ord($str[$i]) & 64)) {
1092 $map = &$this->toASCII[
'utf-8'];
1093 for ($i = 0; isset($str[$i]); $i++) {
1099 } elseif (($c & 192) === 192) {
1102 for (; $c & 128; $c = $c << 1) {
1106 $mbc = substr($str, $i, $bc);
1109 if (isset($map[$mbc])) {
1143 $map = &$this->toASCII[$charset];
1145 for ($i = 0; isset($str[$i]); $i++) {
1148 if ($charset ===
'shift_jis') {
1150 if ($c >= 128 && $c < 160 || $c >= 224) {
1151 $mbc = substr($str, $i, 2);
1157 $mbc = substr($str, $i, 2);
1161 if (isset($map[$mbc])) {