‪TYPO3CMS  9.5
Lexer.php
Go to the documentation of this file.
1 <?php
3 
4 /*
5  * This file is part of the TYPO3 CMS project.
6  *
7  * It is free software; you can redistribute it and/or modify it under
8  * the terms of the GNU General Public License, either version 2
9  * of the License, or any later version.
10  *
11  * For the full copyright and license information, please read the
12  * LICENSE.txt file that was distributed with this source code.
13  *
14  * The TYPO3 project - inspiring people to share!
15  */
16 
20 
26 class ‪Lexer
27 {
29 
/**
 * Deprecation messages for public properties, keyed by property name.
 *
 * @var array
 */
protected $deprecatedPublicProperties = [
    'csObj' => 'Using $csObj within Indexing is discouraged, the property will be removed in TYPO3 v10.0 - if needed instantiate CharsetConverter yourself.',
];

/**
 * When true, split2Words() appends an HTML trace of the split to $debugString.
 *
 * @var bool
 */
public $debug = false;

/**
 * HTML trace built by split2Words() while $debug is enabled.
 *
 * @var string
 */
public $debugString = '';

/**
 * Charset converter instance assigned in the constructor.
 *
 * @var CharsetConverter
 * @deprecated can be removed in TYPO3 v10.0
 */
public $csObj;

/**
 * Lexer configuration.
 *
 * @var array
 */
public $lexerConf = [
    // Code points of the characters . - _ : / '
    // utf8_is_letter() does not end a word on these when a letter follows.
    'printjoins' => [46, 45, 95, 58, 47, 39],
    // Set to true if case sensitive indexing is wanted.
    'casesensitive' => false,
    // Code points that addWords() strips from non-CJK words (45 is "-").
    'removeChars' => [45],
];
72 
/**
 * Creates the lexer and fills the deprecated public $csObj property.
 */
public function __construct()
{
    // @deprecated, can be removed in TYPO3 v10.0.
    $charsetConverter = GeneralUtility::makeInstance(CharsetConverter::class);
    $this->csObj = $charsetConverter;
}
81 
/**
 * Splits a text into the list of words found in it.
 *
 * Unless $lexerConf['casesensitive'] is set, the text is lowercased first.
 * Words are then located one after another with get_word() and collected
 * through addWords(). While $debug is enabled, an HTML trace of skipped
 * characters (wrapped in a red span) and found words is written to
 * $debugString.
 *
 * @param string $wordString Text to split; it is lowercased and measured as UTF-8
 * @return array List of words
 */
public function split2Words($wordString)
{
    // Start with an empty debug trace on every call.
    $this->debugString = '';
    if (!$this->lexerConf['casesensitive']) {
        $wordString = mb_strtolower($wordString, 'utf-8');
    }

    $words = [];
    // Byte offset from which the next word is searched.
    $cursor = 0;
    while (true) {
        // get_word() returns FALSE at the end of the string; list() then
        // leaves both variables NULL, which ends the loop below.
        list($wordStart, $wordLength) = $this->get_word($wordString, $cursor);
        if (!$wordLength) {
            break;
        }
        $this->addWords($words, $wordString, $wordStart, $wordLength);
        if ($this->debug) {
            $skipped = substr($wordString, $cursor, $wordStart - $cursor);
            $word = substr($wordString, $wordStart, $wordLength);
            $this->debugString .= '<span style="color:red">' . htmlspecialchars($skipped) . '</span>' . htmlspecialchars($word);
        }
        $cursor = $wordStart + $wordLength;
    }

    return $words;
}
121 
122  /**********************************
123  *
124  * Helper functions
125  *
126  ********************************/
/**
 * Cuts one word out of $wordString and appends it to $words.
 *
 * The type of the first character decides how the word is added:
 *
 * - CJK: continuous letters and numbers separated by spaces and symbols
 *   are sufficient to find words in western text, but CJK text does not use
 *   spaces or separators between words. Finding real words would need a
 *   dictionary and advanced heuristics. Instead, pairs of consecutive
 *   characters are added, so that searches find characters appearing in
 *   more-or-less the right sequence: ABCDE => AB BC CD DE. This works okay
 *   since both the index and the search query are split in the same manner,
 *   and since the set of characters is huge so the extra matches are not
 *   significant. (Hint taken from ZOPEs chinese user group.)
 *   [Kasper: As far as I can see this will only work well with or-searches!]
 * - Anything else: the characters listed in $lexerConf['removeChars'] are
 *   stripped and the remaining word is added as a single entry.
 *
 * @param array $words Word list, extended by reference
 * @param string $wordString Text containing the word
 * @param int $start Byte offset of the word within $wordString
 * @param int $len Byte length of the word
 */
public function addWords(&$words, &$wordString, $start, $len)
{
    $word = substr($wordString, $start, $len);
    // Classify the word by the code point of its first character.
    $firstCharBytes = 0;
    $firstCodePoint = $this->utf8_ord($word, $firstCharBytes);
    list($type) = $this->charType($firstCodePoint);

    if ($type !== 'cjk') {
        // Normal "single-byte" chars: strip the configured characters, add the word.
        $converter = GeneralUtility::makeInstance(CharsetConverter::class);
        foreach ($this->lexerConf['removeChars'] as $codePoint) {
            $word = str_replace($converter->UnumberToChar($codePoint), '', $word);
        }
        $words[] = $word;
        return;
    }

    // CJK: add overlapping pairs of two characters. A single character is
    // added on its own; an empty word adds nothing.
    $charCount = mb_strlen($word, 'utf-8');
    $lastPairStart = $charCount == 1 ? 0 : $charCount - 2;
    for ($i = 0; $i <= $lastPairStart; $i++) {
        $words[] = mb_substr($word, $i, 2, 'utf-8');
    }
}
176 
/**
 * Finds the next word in $str, searching from byte offset $pos.
 *
 * If a word starts directly at $pos, its position and length are returned.
 * Otherwise the run of non-word characters at $pos is skipped and the word
 * following it is returned.
 *
 * @param string $str Text to search (passed by reference, not modified)
 * @param int $pos Byte offset to start at
 * @return array|bool [byte offset, byte length] of the word, or FALSE when no word is left
 */
public function get_word(&$str, $pos = 0)
{
    $len = 0;
    // If return is TRUE, a word was found starting at this position, so returning position and length:
    if ($this->utf8_is_letter($str, $len, $pos)) {
        return [$pos, $len];
    }
    // If the return value was FALSE it means a sequence of non-word chars were found (or blank string) - so we will start another search for the word:
    $pos += $len;
    // Check end of string before looking for a word. isset() is used because
    // reading an offset past the end of the string raises a notice
    // (a warning as of PHP 8).
    if (!isset($str[$pos])) {
        return false;
    }
    $this->utf8_is_letter($str, $len, $pos);
    return [$pos, $len];
}
200 
/**
 * Measures the run of characters starting at byte offset $pos.
 *
 * The type of the first character decides what is measured: a word if it
 * is a letter, digit or CJK character (see charType()), otherwise a run of
 * non-word characters. $len receives the byte length of that run.
 *
 * Inside a word, characters listed in $lexerConf['printjoins'] do not end
 * the word as long as a word character follows them. A switch between CJK
 * and alpha/num characters is handled like a non-word character.
 *
 * NOTE(review): when the string ends right after print-join characters, the
 * end-of-string return below leaves them included in $len ("ab." gives 3),
 * whereas "ab. " gives 2. Behaviour kept as is.
 *
 * @param string $str Text to inspect (passed by reference, not modified)
 * @param int $len Receives the byte length of the run that was found
 * @param int $pos Byte offset to start at
 * @return bool TRUE if a word starts at $pos, FALSE for non-word characters or end of string
 */
public function utf8_is_letter(&$str, &$len, $pos = 0)
{
    $len = 0;
    // Byte length of the most recently read character; added to $len one iteration later.
    $bc = 0;
    $cp = 0;
    // Length of the word before the first pending print-join character (0 = none pending).
    $printJoinLgd = 0;
    $cType = ($cType_prev = false);
    // Letter type
    $letter = true;
    // looking for a letter?
    // isset() instead of reading $str[$pos]: reading past the end of the
    // string raises a notice (a warning as of PHP 8).
    if (!isset($str[$pos])) {
        // Return FALSE on end-of-string at this stage
        return false;
    }
    while (true) {
        // If characters has been obtained we will know whether the string starts as a sequence of letters or not:
        if ($len) {
            if ($letter) {
                // We are in a sequence of words
                if (
                    !$cType
                    || $cType_prev === 'cjk' && ($cType === 'num' || $cType === 'alpha')
                    || $cType === 'cjk' && ($cType_prev === 'num' || $cType_prev === 'alpha')
                ) {
                    // Check if the non-letter char is NOT a print-join char because then it signifies the end of the word.
                    // Loose comparison kept on purpose so configured values given as numeric strings still match.
                    if (!in_array($cp, $this->lexerConf['printjoins'])) {
                        // If a printjoin start length has been recorded, set that back now so the length is right (filtering out multiple end chars)
                        if ($printJoinLgd) {
                            $len = $printJoinLgd;
                        }
                        return true;
                    }
                    // If a printJoin char is found, record the length if it has not been recorded already:
                    if (!$printJoinLgd) {
                        $printJoinLgd = $len;
                    }
                } else {
                    // When a true letter is found, reset printJoinLgd counter:
                    $printJoinLgd = 0;
                }
            } elseif ($cType) {
                // end of non-word reached
                return false;
            }
        }
        // add byte-length of last found character
        $len += $bc;
        if (!isset($str[$pos])) {
            // End of string; return status of string till now
            return $letter;
        }
        // Get next chars unicode number:
        $cp = $this->utf8_ord($str, $bc, $pos);
        $pos += $bc;
        // Determine the type:
        $cType_prev = $cType;
        list($cType) = $this->charType($cp);
        if ($cType) {
            continue;
        }
        // Setting letter to FALSE if the first char was not a letter!
        if (!$len) {
            $letter = false;
        }
    }
}
276 
/**
 * Classifies a Unicode code point as digit, letter or CJK character.
 *
 * @param int $cp Unicode code point
 * @return array|null ['num'], ['alpha'] or ['cjk']; NULL for any other code point
 */
public function charType($cp)
{
    // Numeric?
    if ($cp >= 48 && $cp <= 57) {
        return ['num'];
    }

    // LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
    // A-Z and a-z
    if ($cp >= 65 && $cp <= 90 || $cp >= 97 && $cp <= 122) {
        return ['alpha'];
    }
    // 192-255 without the two code points 215 and 247
    if ($cp >= 192 && $cp <= 255 && $cp != 215 && $cp != 247) {
        return ['alpha'];
    }
    if ($cp >= 256 && $cp < 640) {
        return ['alpha'];
    }
    if ($cp == 902 || $cp >= 904 && $cp < 1024) {
        return ['alpha'];
    }
    if ($cp >= 1024 && $cp < 1154 || $cp >= 1162 && $cp < 1328) {
        return ['alpha'];
    }
    if ($cp >= 1424 && $cp < 1456 || $cp >= 1488 && $cp < 1523) {
        return ['alpha'];
    }
    if ($cp >= 1569 && $cp <= 1624 || $cp >= 1646 && $cp <= 1747) {
        return ['alpha'];
    }
    if ($cp >= 7680 && $cp < 8192) {
        return ['alpha'];
    }

    // Looking for CJK (Chinese / Japanese / Korean)
    // Ranges are not certain - deducted from the translation tables in typo3/sysext/core/Resources/Private/Charsets/csconvtbl/
    // Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
    if ($cp >= 12352 && $cp <= 12543 || $cp >= 12592 && $cp <= 12687) {
        return ['cjk'];
    }
    if ($cp >= 13312 && $cp <= 19903 || $cp >= 19968 && $cp <= 40879) {
        return ['cjk'];
    }
    if ($cp >= 44032 && $cp <= 55215 || $cp >= 131072 && $cp <= 195103) {
        return ['cjk'];
    }

    // Everything else has no type. Callers unpack the result with list(),
    // which yields NULL for a NULL return value.
    return null;
}
300 
/**
 * Decodes the UTF-8 character starting at byte offset $pos.
 *
 * The input is not validated. Malformed input is decoded without raising
 * errors: bytes missing at the end of the string count as 0, and a lead
 * byte without payload bits (0xFE, 0xFF) contributes 0. A byte of value
 * 128 or lower is returned as is with a length of 1.
 *
 * @param string $str Text to read from (passed by reference, not modified)
 * @param int $len Receives the number of bytes the character occupies
 * @param int $pos Byte offset of the first byte of the character
 * @param bool $hex If set, the result is returned as "x" followed by the hexadecimal number
 * @return int|string Unicode code point, or its "x..." hex notation when $hex is set
 */
public function utf8_ord(&$str, &$len, $pos = 0, $hex = false)
{
    // An offset past the end of the string decodes to 0, as before, but
    // without triggering an "Uninitialized string offset" notice/warning.
    $ord = isset($str[$pos]) ? ord($str[$pos]) : 0;
    $len = 1;
    if ($ord > 128) {
        for ($bc = -1, $mbs = $ord; $mbs & 128; $mbs = $mbs << 1) {
            // calculate number of extra bytes
            $bc++;
        }
        $len += $bc;
        // mask utf-8 lead-in bytes
        // For $bc >= 6 the lead byte holds no data bits. The guard also
        // avoids a negative shift width (ArithmeticError) for byte 0xFF.
        $ord = $bc < 6 ? $ord & ((1 << (6 - $bc)) - 1) : 0;
        // "bring in" data bytes
        for ($i = $pos + 1; $bc; $bc--, $i++) {
            $dataBits = isset($str[$i]) ? ord($str[$i]) & 63 : 0;
            $ord = $ord << 6 | $dataBits;
        }
    }
    return $hex ? 'x' . dechex($ord) : $ord;
}
329 }
‪TYPO3\CMS\IndexedSearch
‪TYPO3\CMS\IndexedSearch\Lexer\$csObj
‪CharsetConverter $csObj
Definition: Lexer.php:53
‪TYPO3\CMS\IndexedSearch\Lexer\get_word
‪array get_word(&$str, $pos=0)
Definition: Lexer.php:178
‪TYPO3\CMS\IndexedSearch\Lexer\__construct
‪__construct()
Definition: Lexer.php:70
‪TYPO3\CMS\IndexedSearch\Lexer\$lexerConf
‪array $lexerConf
Definition: Lexer.php:59
‪TYPO3\CMS\IndexedSearch\Lexer\charType
‪array charType($cp)
Definition: Lexer.php:277
‪TYPO3\CMS\Core\Charset\CharsetConverter
Definition: CharsetConverter.php:54
‪TYPO3\CMS\IndexedSearch\Lexer\$deprecatedPublicProperties
‪array $deprecatedPublicProperties
Definition: Lexer.php:32
‪TYPO3\CMS\IndexedSearch\Lexer\utf8_is_letter
‪bool utf8_is_letter(&$str, &$len, $pos=0)
Definition: Lexer.php:203
‪TYPO3\CMS\IndexedSearch\Lexer\utf8_ord
‪int utf8_ord(&$str, &$len, $pos=0, $hex=false)
Definition: Lexer.php:304
‪TYPO3\CMS\IndexedSearch\Lexer\split2Words
‪array split2Words($wordString)
Definition: Lexer.php:83
‪TYPO3\CMS\IndexedSearch\Lexer\$debugString
‪string $debugString
Definition: Lexer.php:46
‪debug
‪debug($variable='', $title=null, $group=null)
Definition: GlobalDebugFunctions.php:5
‪TYPO3\CMS\IndexedSearch\Lexer
Definition: Lexer.php:27
‪TYPO3\CMS\IndexedSearch\Lexer\$debug
‪bool $debug
Definition: Lexer.php:40
‪TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait
Definition: PublicPropertyDeprecationTrait.php:66
‪TYPO3\CMS\Core\Utility\GeneralUtility
Definition: GeneralUtility.php:45
‪TYPO3\CMS\IndexedSearch\Lexer\addWords
‪addWords(&$words, &$wordString, $start, $len)
Definition: Lexer.php:130