‪TYPO3CMS  11.5
Lexer.php
Go to the documentation of this file.
1 <?php
2 
3 /*
4  * This file is part of the TYPO3 CMS project.
5  *
6  * It is free software; you can redistribute it and/or modify it under
7  * the terms of the GNU General Public License, either version 2
8  * of the License, or any later version.
9  *
10  * For the full copyright and license information, please read the
11  * LICENSE.txt file that was distributed with this source code.
12  *
13  * The TYPO3 project - inspiring people to share!
14  */
15 
17 
20 
/**
 * Lexer used to split a UTF-8 text into the words that get indexed / searched.
 *
 * Runs of letters and digits form words; everything else separates words,
 * except for the configured "print-join" characters which may glue two word
 * parts together (e.g. "foo.bar"). CJK runs are broken into overlapping
 * two-character pairs instead (see addWords()).
 */
class Lexer
{
    protected const CHARTYPE_NUMBER = 'num';
    protected const CHARTYPE_ALPHA = 'alpha';
    // CJK (Chinese / Japanese / Korean)
    protected const CHARTYPE_CJK = 'cjk';

    /**
     * When TRUE, split2Words() additionally fills $debugString.
     *
     * @var bool
     */
    public $debug = false;

    /**
     * HTML trace of the last split2Words() run (only filled if $debug is set):
     * skipped non-word characters are wrapped in a red <span>, words are
     * appended as escaped plain text.
     *
     * @var string
     */
    public $debugString = '';

    /**
     * Configuration of the lexer:
     *  - printjoins:    Unicode code points that do not end a word when they
     *                   are followed by further word characters
     *  - casesensitive: if FALSE the input is lowercased before splitting
     *  - removeChars:   Unicode code points stripped from every non-CJK word
     *
     * @var array
     */
    public $lexerConf = [
        'printjoins' => [
            46, // .
            45, // -
            95, // _
            58, // :
            47, // /
            39, // '
        ],
        'casesensitive' => false, // Set, if case-sensitive indexing is wanted
        'removeChars' => [],
    ];

    /**
     * Splits the input string into words.
     *
     * @param string $wordString The string of words to split (UTF-8)
     * @return string[] Words in order of appearance (CJK runs yield character pairs)
     */
    public function split2Words($wordString)
    {
        // Reset debug string:
        $this->debugString = '';
        // Convert the string to lowercase unless case-sensitive indexing is wanted:
        if (!$this->lexerConf['casesensitive']) {
            $wordString = mb_strtolower($wordString, 'utf-8');
        }
        // Now, splitting words. $pos is a BYTE offset into $wordString.
        $pos = 0;
        $words = [];
        while (true) {
            $match = $this->get_word($wordString, $pos);
            // get_word() returns FALSE once the end of the string is reached.
            if (!is_array($match)) {
                break;
            }
            [$start, $len] = $match;
            if (!$len) {
                break;
            }
            $this->addWords($words, $wordString, $start, $len);
            if ($this->debug) {
                // Bytes between $pos and $start were skipped as non-word characters.
                $this->debugString .= '<span style="color:red">' . htmlspecialchars(substr(
                    $wordString,
                    $pos,
                    $start - $pos
                )) . '</span>' . htmlspecialchars(substr($wordString, $start, $len));
            }
            $pos = $start + $len;
        }
        return $words;
    }

    /**********************************
     *
     * Helper functions
     *
     ********************************/

    /**
     * Extracts the word at the given byte range and appends it to $words.
     *
     * CJK text does not use spaces or separators between words, so finding
     * real word boundaries would require a dictionary and heuristics.
     * Instead, a CJK run is split into pairs of consecutive characters:
     *
     *     ABCDE => AB BC CD DE
     *
     * This works because index and search query are split the same way.
     * A CJK run of a single character is added as-is.
     *
     * Any other word has the configured "removeChars" stripped and is then
     * added as one entry.
     *
     * @param string[] $words Array of accumulated words (modified in place)
     * @param string $wordString Complete input string
     * @param int $start Byte offset of the word in $wordString
     * @param int $len Byte length of the word
     */
    public function addWords(&$words, &$wordString, $start, $len)
    {
        // Get word out of string:
        $theWord = substr($wordString, $start, $len);
        // The type of the first character decides how the whole word is handled:
        $bc = 0;
        $cp = $this->utf8_ord($theWord, $bc);
        $cType = $this->charType((int)$cp);
        if ($cType === self::CHARTYPE_CJK) {
            // Character (not byte) length of the run:
            $strlen = mb_strlen($theWord, 'utf-8');
            // n characters produce n-1 overlapping pairs; a lone character produces itself.
            $pairCount = max(1, $strlen - 1);
            for ($a = 0; $a < $pairCount; $a++) {
                $words[] = mb_substr($theWord, $a, 2, 'utf-8');
            }
            return;
        }
        // Normal word: remove unwanted chars. The converter is only needed
        // (and therefore only instantiated) if there is anything to remove.
        if (!empty($this->lexerConf['removeChars'])) {
            $charsetConverter = GeneralUtility::makeInstance(CharsetConverter::class);
            foreach ($this->lexerConf['removeChars'] as $skipJoin) {
                $theWord = str_replace($charsetConverter->UnumberToChar($skipJoin), '', $theWord);
            }
        }
        // Add word:
        $words[] = $theWord;
    }

    /**
     * Finds the next word at or after the given byte position.
     *
     * @param string $str Input string (passed by reference, not modified)
     * @param int $pos Byte offset to start searching from
     * @return array|bool Array of [start byte offset, byte length] of the word,
     *                    or FALSE if only non-word characters (or nothing) remain
     */
    public function get_word(&$str, $pos = 0)
    {
        $len = 0;
        // If return is TRUE, a word was found starting at this position, so returning position and length:
        if ($this->utf8_is_letter($str, $len, $pos)) {
            return [$pos, $len];
        }
        // Otherwise $len is the byte length of the non-word sequence found
        // (0 for a blank string) - skip it and look for a word right after:
        $pos += $len;
        if ((string)($str[$pos] ?? '') === '') {
            // Check end of string before looking for word of course.
            return false;
        }
        $this->utf8_is_letter($str, $len, $pos);
        return [$pos, $len];
    }

    /**
     * Scans the sequence of same-kind characters starting at $pos.
     *
     * If the first character is a letter/digit/CJK character, the scan covers
     * the word starting there and TRUE is returned. Otherwise the scan covers
     * the run of non-word characters and FALSE is returned. In both cases
     * $len receives the byte length of the scanned sequence.
     *
     * A word ends at a non-word character that is not a print-join character,
     * or at a switch between CJK and alpha/numeric characters. Print-join
     * characters that turn out to be trailing (not followed by another word
     * character) are excluded from the returned length.
     *
     * NOTE(review): when the string ends directly after trailing print-join
     * characters, they are NOT trimmed (e.g. "foo." at end of input yields
     * length 4) - confirm whether this is intended.
     *
     * @param string $str Input string (passed by reference, not modified)
     * @param int $len Receives the byte length of the scanned sequence
     * @param int $pos Byte offset to start scanning from
     * @return bool TRUE if a word starts at $pos, FALSE for a non-word run or end of string
     */
    public function utf8_is_letter(&$str, &$len, $pos = 0)
    {
        $len = 0;
        // Byte length of the most recently read character:
        $bc = 0;
        // Code point of the most recently read character:
        $cp = 0;
        // Length of the word before the first of a run of print-join chars (0 = none pending):
        $printJoinLgd = 0;
        $cType = false;
        $cType_prev = false;
        // Whether the sequence started with a word character:
        $letter = true;
        if ((string)($str[$pos] ?? '') === '') {
            // Return FALSE on end-of-string at this stage
            return false;
        }
        while (true) {
            // Once at least one character is accounted for in $len, we know whether
            // the sequence started as a word or not and can test the last read character:
            if ($len) {
                if ($letter) {
                    // We are inside a word. It may end at a non-word char or at a CJK <-> alpha/numeric switch.
                    $isBoundary = !$cType
                        || ($cType_prev === self::CHARTYPE_CJK && ($cType === self::CHARTYPE_NUMBER || $cType === self::CHARTYPE_ALPHA))
                        || ($cType === self::CHARTYPE_CJK && ($cType_prev === self::CHARTYPE_NUMBER || $cType_prev === self::CHARTYPE_ALPHA));
                    if ($isBoundary) {
                        // Anything but a print-join char ends the word here.
                        // Loose comparison on purpose: configured code points may be numeric strings.
                        if (!in_array($cp, $this->lexerConf['printjoins'])) {
                            // Cut off pending trailing print-join chars so the length is right:
                            if ($printJoinLgd) {
                                $len = $printJoinLgd;
                            }
                            return true;
                        }
                        // Print-join char: remember the word length before it, if not recorded already.
                        if (!$printJoinLgd) {
                            $printJoinLgd = $len;
                        }
                    } else {
                        // A true word character follows, so pending print-join chars belong to the word.
                        $printJoinLgd = 0;
                    }
                } elseif ($cType) {
                    // End of non-word sequence reached
                    return false;
                }
            }
            // Add byte-length of last found character
            $len += $bc;
            if ((string)($str[$pos] ?? '') === '') {
                // End of string; return status of string till now
                return $letter;
            }
            // Get next char's unicode number and advance:
            $cp = $this->utf8_ord($str, $bc, $pos);
            $pos += $bc;
            // Determine the type:
            $cType_prev = $cType;
            $cType = $this->charType((int)$cp);
            // Setting letter to FALSE if the first char was not a letter!
            if ($cType === null && !$len) {
                $letter = false;
            }
        }
    }

    /**
     * Determines the character type of a Unicode code point.
     *
     * @param int $cp Unicode code point
     * @return string|null One of the CHARTYPE_* values ('num', 'alpha', 'cjk'),
     *                     or NULL if the character is not part of a word
     */
    public function charType($cp)
    {
        // Numeric? (0-9)
        if ($cp >= 48 && $cp <= 57) {
            return self::CHARTYPE_NUMBER;
        }
        // LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
        if (
            // A-Z, a-z
            ($cp >= 65 && $cp <= 90)
            || ($cp >= 97 && $cp <= 122)
            // Latin-1 letters, excluding multiplication (215) and division (247) signs
            || ($cp >= 192 && $cp <= 255 && $cp != 215 && $cp != 247)
            // Extended Latin
            || ($cp >= 256 && $cp < 640)
            // Greek
            || $cp == 902
            || ($cp >= 904 && $cp < 1024)
            // Cyrillic
            || ($cp >= 1024 && $cp < 1154)
            || ($cp >= 1162 && $cp < 1328)
            // Hebrew
            || ($cp >= 1424 && $cp < 1456)
            || ($cp >= 1488 && $cp < 1523)
            // Arabic
            || ($cp >= 1569 && $cp <= 1624)
            || ($cp >= 1646 && $cp <= 1747)
            // Latin Extended Additional / Greek Extended
            || ($cp >= 7680 && $cp < 8192)
        ) {
            return self::CHARTYPE_ALPHA;
        }
        // Looking for CJK (Chinese / Japanese / Korean)
        // Ranges are not certain - deducted from the translation tables in typo3/sysext/core/Resources/Private/Charsets/csconvtbl/
        // Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
        if (
            // Hiragana / Katakana
            ($cp >= 12352 && $cp <= 12543)
            // Hangul compatibility Jamo
            || ($cp >= 12592 && $cp <= 12687)
            // CJK unified ideographs (extension A and main block)
            || ($cp >= 13312 && $cp <= 19903)
            || ($cp >= 19968 && $cp <= 40879)
            // Hangul syllables
            || ($cp >= 44032 && $cp <= 55215)
            // Supplementary ideographic plane
            || ($cp >= 131072 && $cp <= 195103)
        ) {
            return self::CHARTYPE_CJK;
        }
        return null;
    }

    /**
     * Decodes the UTF-8 character starting at byte offset $pos.
     *
     * No validation of the sequence is done beyond what is needed to avoid
     * runtime errors: bytes missing from a truncated sequence count as 0, and
     * the lead byte 0xFF (which cannot start any sequence) is consumed as a
     * single byte with code point 0.
     *
     * @param string $str Input string (passed by reference, not modified)
     * @param int $len Receives the byte length of the decoded character
     * @param int $pos Byte offset of the character's first byte
     * @param bool $hex If set, the result is returned as 'x' followed by the hexadecimal code point
     * @return int|string Unicode code point, or its 'x…' hex notation if $hex is set
     */
    public function utf8_ord(&$str, &$len, $pos = 0, $hex = false)
    {
        $ord = isset($str[$pos]) ? ord($str[$pos]) : 0;
        $len = 1;
        if ($ord > 128) {
            // The number of leading 1-bits minus one is the number of extra bytes:
            $bc = -1;
            for ($mbs = $ord; $mbs & 128; $mbs = $mbs << 1) {
                $bc++;
            }
            if ($bc > 6) {
                // 0xFF: all eight bits set. The mask below would need a negative
                // shift (ArithmeticError), so treat it as one undecodable byte.
                $ord = 0;
            } else {
                $len += $bc;
                // Mask utf-8 lead-in bits
                $ord = $ord & ((1 << (6 - $bc)) - 1);
                // "Bring in" data bytes, six payload bits each
                for ($i = $pos + 1; $bc; $bc--, $i++) {
                    $dataByte = isset($str[$i]) ? ord($str[$i]) : 0;
                    $ord = ($ord << 6) | ($dataByte & 63);
                }
            }
        }
        return $hex ? 'x' . dechex($ord) : $ord;
    }
}
‪TYPO3\CMS\IndexedSearch
‪TYPO3\CMS\IndexedSearch\Lexer\get_word
‪array bool get_word(&$str, $pos=0)
Definition: Lexer.php:162
‪TYPO3\CMS\IndexedSearch\Lexer\$lexerConf
‪array $lexerConf
Definition: Lexer.php:49
‪TYPO3\CMS\IndexedSearch\Lexer\CHARTYPE_NUMBER
‪const CHARTYPE_NUMBER
Definition: Lexer.php:28
‪TYPO3\CMS\IndexedSearch\Lexer\CHARTYPE_ALPHA
‪const CHARTYPE_ALPHA
Definition: Lexer.php:29
‪TYPO3\CMS\Core\Charset\CharsetConverter
Definition: CharsetConverter.php:54
‪TYPO3\CMS\IndexedSearch\Lexer\utf8_is_letter
‪bool utf8_is_letter(&$str, &$len, $pos=0)
Definition: Lexer.php:187
‪TYPO3\CMS\IndexedSearch\Lexer\split2Words
‪array split2Words($wordString)
Definition: Lexer.php:69
‪TYPO3\CMS\IndexedSearch\Lexer\$debugString
‪string $debugString
Definition: Lexer.php:43
‪TYPO3\CMS\IndexedSearch\Lexer\charType
‪string null charType($cp)
Definition: Lexer.php:261
‪TYPO3\CMS\IndexedSearch\Lexer\CHARTYPE_CJK
‪const CHARTYPE_CJK
Definition: Lexer.php:31
‪debug
‪debug($variable='', $title=null, $group=null)
Definition: GlobalDebugFunctions.php:19
‪TYPO3\CMS\IndexedSearch\Lexer
Definition: Lexer.php:27
‪TYPO3\CMS\IndexedSearch\Lexer\$debug
‪bool $debug
Definition: Lexer.php:37
‪TYPO3\CMS\Core\Utility\GeneralUtility
Definition: GeneralUtility.php:50
‪TYPO3\CMS\IndexedSearch\Lexer\addWords
‪addWords(&$words, &$wordString, $start, $len)
Definition: Lexer.php:114
‪TYPO3\CMS\IndexedSearch\Lexer\utf8_ord
‪int string utf8_ord(&$str, &$len, $pos=0, $hex=false)
Definition: Lexer.php:289