35 'csObj' =>
'Using $csObj within Indexing is discouraged, the property will be removed in TYPO3 v10.0 - if needed instantiate CharsetConverter yourself.',
67 'printjoins' => [46, 45, 95, 58, 47, 39],
68 'casesensitive' =>
false,
79 $this->csObj = GeneralUtility::makeInstance(CharsetConverter::class);
92 $this->debugString =
'';
94 if (!$this->lexerConf[
'casesensitive']) {
95 $wordString = mb_strtolower($wordString,
'utf-8');
102 $this->debugString =
'';
104 list($start, $len) = $this->
get_word($wordString, $pos);
106 $this->
addWords($words, $wordString, $start, $len);
108 $this->debugString .=
'<span style="color:red">' . htmlspecialchars(substr(
112 )) .
'</span>' . htmlspecialchars(substr($wordString, $start, $len));
114 $pos = $start + $len;
136 public function addWords(&$words, &$wordString, $start, $len)
139 $theWord = substr($wordString, $start, $len);
156 if ($cType ===
'cjk') {
158 $strlen = mb_strlen($theWord,
'utf-8');
160 for ($a = 0; $a < $strlen; $a++) {
161 if ($strlen == 1 || $a < $strlen - 1) {
162 $words[] = mb_substr($theWord, $a, 2,
'utf-8');
168 $charsetConverter = GeneralUtility::makeInstance(CharsetConverter::class);
169 foreach ($this->lexerConf[
'removeChars'] as $skipJoin) {
170 $theWord = str_replace($charsetConverter->UnumberToChar($skipJoin),
'', $theWord);
184 public function get_word(&$str, $pos = 0)
193 if ($str[$pos] ==
'') {
215 $cType = ($cType_prev =
false);
219 if ($str[$pos] ==
'') {
230 || $cType_prev ===
'cjk' && ($cType ===
'num' || $cType ===
'alpha')
231 || $cType ===
'cjk' && ($cType_prev ===
'num' || $cType_prev ===
'alpha')
234 if (!in_array($cp, $this->lexerConf[
'printjoins'])) {
237 $len = $printJoinLgd;
242 if (!$printJoinLgd) {
243 $printJoinLgd = $len;
249 } elseif (!$letter && $cType) {
256 if ($str[$pos] ==
'') {
264 $cType_prev = $cType;
286 if ($cp >= 48 && $cp <= 57) {
290 if ($cp >= 65 && $cp <= 90 || $cp >= 97 && $cp <= 122 || $cp >= 192 && $cp <= 255 && $cp != 215 && $cp != 247 || $cp >= 256 && $cp < 640 || ($cp == 902 || $cp >= 904 && $cp < 1024) || ($cp >= 1024 && $cp < 1154 || $cp >= 1162 && $cp < 1328) || ($cp >= 1424 && $cp < 1456 || $cp >= 1488 && $cp < 1523) || ($cp >= 1569 && $cp <= 1624 || $cp >= 1646 && $cp <= 1747) || $cp >= 7680 && $cp < 8192) {
296 if ($cp >= 12352 && $cp <= 12543 || $cp >= 12592 && $cp <= 12687 || $cp >= 13312 && $cp <= 19903 || $cp >= 19968 && $cp <= 40879 || $cp >= 44032 && $cp <= 55215 || $cp >= 131072 && $cp <= 195103) {
310 public function utf8_ord(&$str, &$len, $pos = 0, $hex =
false)
312 $ord = ord($str[$pos]);
315 for ($bc = -1, $mbs = $ord; $mbs & 128; $mbs = $mbs << 1) {
320 $ord = $ord & (1 << 6 - $bc) - 1;
323 for ($i = $pos + 1; $bc; $bc--, $i++) {
324 $ord = $ord << 6 | ord($str[$i]) & 63;
327 return $hex ?
'x' . dechex($ord) : $ord;