‪TYPO3CMS  10.4
Lexer.php
Go to the documentation of this file.
1 <?php
2 
3 /*
4  * This file is part of the TYPO3 CMS project.
5  *
6  * It is free software; you can redistribute it and/or modify it under
7  * the terms of the GNU General Public License, either version 2
8  * of the License, or any later version.
9  *
10  * For the full copyright and license information, please read the
11  * LICENSE.txt file that was distributed with this source code.
12  *
13  * The TYPO3 project - inspiring people to share!
14  */
15 
17 
20 
/**
 * Lexer for the indexed search: splits a UTF-8 string into the list of words
 * that are written to (or looked up in) the search index.
 *
 * All positions and lengths handled by this class are BYTE offsets into the
 * UTF-8 string, not character offsets.
 */
class Lexer
{
    /**
     * If set, split2Words() fills $debugString with HTML that shows the skipped
     * (non-word) parts in red and the detected words in plain text.
     *
     * @var bool
     */
    public $debug = false;

    /**
     * HTML debug output of the last split2Words() call (only filled when $debug is set).
     *
     * @var string
     */
    public $debugString = '';

    /**
     * Lexer configuration. All characters are given as Unicode code points.
     *
     * - printjoins:    characters that do not end a word when they stand between letters
     * - casesensitive: if FALSE, the input is lowercased before it is split
     * - removeChars:   characters stripped from every non-CJK word before it is stored
     *
     * @var array
     */
    public $lexerConf = [
        // Characters: . - _ : / '
        'printjoins' => [46, 45, 95, 58, 47, 39],
        // Set, if case sensitive indexing is wanted.
        'casesensitive' => false,
        // Character: -
        'removeChars' => [45]
    ];

    /**
     * Splits the input string into words.
     *
     * @param string $wordString The string to split (UTF-8)
     * @return array List of words found in the string, in order of appearance
     */
    public function split2Words($wordString)
    {
        $this->debugString = '';
        $wordString = (string)$wordString;
        if (!$this->lexerConf['casesensitive']) {
            $wordString = mb_strtolower($wordString, 'utf-8');
        }
        $pos = 0;
        $words = [];
        while (true) {
            $match = $this->get_word($wordString, $pos);
            if (!is_array($match)) {
                // End of string reached
                break;
            }
            [$start, $len] = $match;
            if (!$len) {
                break;
            }
            $this->addWords($words, $wordString, $start, $len);
            if ($this->debug) {
                // Skipped bytes between the previous word and this one in red, then the word itself
                $this->debugString .= '<span style="color:red">'
                    . htmlspecialchars(substr($wordString, $pos, $start - $pos))
                    . '</span>'
                    . htmlspecialchars(substr($wordString, $start, $len));
            }
            $pos = $start + $len;
        }
        return $words;
    }

    /**********************************
     *
     * Helper functions
     *
     ********************************/

    /**
     * Extracts one word from $wordString and appends it to $words.
     *
     * The type of the first character decides how the word is stored:
     *
     * - CJK sequences are stored as overlapping pairs of characters
     *   (ABCDE => AB BC CD DE); a single CJK character is stored as is.
     *   CJK text does not use spaces or separators between words, so finding
     *   real word boundaries would need a dictionary and heuristics. Because
     *   both the index and the search query are split the same way, and the
     *   character set is huge, the extra matches are not significant.
     *   (Hint taken from ZOPE's Chinese user group.)
     *   [Kasper: As far as I can see this will only work well with or-searches!]
     * - Any other word is stored once, with all "removeChars" stripped.
     *
     * @param array $words Word list, extended in place
     * @param string $wordString The complete string that is being split
     * @param int $start Byte offset of the word in $wordString
     * @param int $len Byte length of the word
     */
    public function addWords(&$words, &$wordString, $start, $len)
    {
        $theWord = substr($wordString, $start, $len);
        // The type of the first character decides how the whole word is handled
        $bc = 0;
        $cp = $this->utf8_ord($theWord, $bc);
        $cType = $this->charType($cp)[0] ?? null;
        if ($cType === 'cjk') {
            $strlen = mb_strlen($theWord, 'utf-8');
            // One entry for a single character, otherwise one entry per adjacent pair
            $pairCount = $strlen == 1 ? 1 : $strlen - 1;
            for ($a = 0; $a < $pairCount; $a++) {
                $words[] = mb_substr($theWord, $a, 2, 'utf-8');
            }
        } else {
            // Normal word: strip the configured characters, then add it
            $charsetConverter = GeneralUtility::makeInstance(CharsetConverter::class);
            foreach ($this->lexerConf['removeChars'] as $skipJoin) {
                $theWord = str_replace($charsetConverter->UnumberToChar($skipJoin), '', $theWord);
            }
            $words[] = $theWord;
        }
    }

    /**
     * Finds the next word in $str, starting the search at byte offset $pos.
     *
     * @param string $str Input string (UTF-8)
     * @param int $pos Byte offset to start searching from
     * @return array|bool [byte offset of the word, byte length of the word], or FALSE if only
     *                    non-word characters (or nothing at all) remain in the string
     */
    public function get_word(&$str, $pos = 0)
    {
        $len = 0;
        // A word starts right at $pos
        if ($this->utf8_is_letter($str, $len, $pos)) {
            return [$pos, $len];
        }
        // Otherwise $len is the byte length of the non-word sequence at $pos: skip it
        $pos += $len;
        // "?? ''" keeps the end-of-string test free of "Uninitialized string offset" errors
        if (($str[$pos] ?? '') === '') {
            return false;
        }
        $this->utf8_is_letter($str, $len, $pos);
        return [$pos, $len];
    }

    /**
     * Measures the sequence of characters starting at byte offset $pos and tells
     * whether it is a word or a run of non-word characters.
     *
     * A word ends at the first character that has no type (see charType()) and
     * is not a "printjoins" character, or where CJK and alpha/num characters
     * meet. Print-join characters directly in front of such an end are cut off.
     *
     * NOTE(review): if the string ends directly after print-join characters
     * (e.g. "foo."), they are NOT cut off and stay part of the reported word.
     * Left as is, since changing it would alter what ends up in the index.
     *
     * @param string $str Input string (UTF-8)
     * @param int $len Is set to the byte length of the sequence found
     * @param int $pos Byte offset to start at
     * @return bool TRUE if the sequence is a word, FALSE if it is a run of non-word
     *              characters or $pos is at or behind the end of the string
     */
    public function utf8_is_letter(&$str, &$len, $pos = 0)
    {
        $len = 0;
        // Byte length and code point of the character read last
        $bc = 0;
        $cp = 0;
        // Length of the word before the first of the currently trailing print-join chars (0 = none)
        $printJoinLgd = 0;
        $cType = false;
        $cType_prev = false;
        // Whether the sequence is a word; decided by the very first character
        $letter = true;
        if (($str[$pos] ?? '') === '') {
            // End of string
            return false;
        }
        while (true) {
            // $len is still 0 while only the first character has been read. After that, the
            // character read last is judged here BEFORE its length is added to $len.
            if ($len) {
                if ($letter) {
                    $isBoundary = !$cType
                        || ($cType_prev === 'cjk' && ($cType === 'num' || $cType === 'alpha'))
                        || ($cType === 'cjk' && ($cType_prev === 'num' || $cType_prev === 'alpha'));
                    if ($isBoundary) {
                        // Anything but a print-join character ends the word here.
                        // Loose comparison on purpose: numeric strings in the configuration match, too.
                        if (!in_array($cp, $this->lexerConf['printjoins'])) {
                            if ($printJoinLgd) {
                                // Cut off the print-join characters at the end of the word
                                $len = $printJoinLgd;
                            }
                            return true;
                        }
                        // Remember where the run of print-join characters started
                        if (!$printJoinLgd) {
                            $printJoinLgd = $len;
                        }
                    } else {
                        // A real letter follows, so the print-join characters are inside the word
                        $printJoinLgd = 0;
                    }
                } elseif ($cType) {
                    // First letter after a run of non-word characters
                    return false;
                }
            }
            // Add the byte length of the character read last
            $len += $bc;
            if (($str[$pos] ?? '') === '') {
                // End of string; return status of the sequence till now
                return $letter;
            }
            // Read the next character and determine its type
            $cp = $this->utf8_ord($str, $bc, $pos);
            $pos += $bc;
            $cType_prev = $cType;
            // charType() returns an empty array for non-letters
            $cType = $this->charType($cp)[0] ?? null;
            if (!$cType && !$len) {
                // The first character is not a letter, so this is a non-word sequence
                $letter = false;
            }
        }
    }

    /**
     * Determines the type of a character.
     *
     * @param int $cp Unicode code point
     * @return array ['num'], ['alpha'] or ['cjk']; an empty array for every other character
     */
    public function charType($cp)
    {
        // Numeric: 0-9
        if ($cp >= 48 && $cp <= 57) {
            return ['num'];
        }
        // Alpha chars (Latin, Greek, Cyrillic, Hebrew and Arabic)
        if (
            // A-Z, a-z
            ($cp >= 65 && $cp <= 90)
            || ($cp >= 97 && $cp <= 122)
            // 192-255 without the multiplication (215) and division (247) signs
            || ($cp >= 192 && $cp <= 255 && $cp != 215 && $cp != 247)
            || ($cp >= 256 && $cp < 640)
            // Greek
            || $cp == 902
            || ($cp >= 904 && $cp < 1024)
            // Cyrillic
            || ($cp >= 1024 && $cp < 1154)
            || ($cp >= 1162 && $cp < 1328)
            // Hebrew
            || ($cp >= 1424 && $cp < 1456)
            || ($cp >= 1488 && $cp < 1523)
            // Arabic
            || ($cp >= 1569 && $cp <= 1624)
            || ($cp >= 1646 && $cp <= 1747)
            // U+1E00 - U+1FFF
            || ($cp >= 7680 && $cp < 8192)
        ) {
            return ['alpha'];
        }
        // CJK (Chinese / Japanese / Korean)
        // Ranges are not certain - deducted from the translation tables in typo3/sysext/core/Resources/Private/Charsets/csconvtbl/
        // Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
        if (
            // U+3040 - U+30FF
            ($cp >= 12352 && $cp <= 12543)
            // U+3130 - U+318F
            || ($cp >= 12592 && $cp <= 12687)
            // U+3400 - U+4DBF
            || ($cp >= 13312 && $cp <= 19903)
            // U+4E00 - U+9FAF
            || ($cp >= 19968 && $cp <= 40879)
            // U+AC00 - U+D7AF
            || ($cp >= 44032 && $cp <= 55215)
            // U+20000 - U+2FA1F
            || ($cp >= 131072 && $cp <= 195103)
        ) {
            return ['cjk'];
        }

        return [];
    }

    /**
     * Decodes the UTF-8 character at byte offset $pos into its Unicode code point.
     *
     * Malformed input never raises an error: a position behind the end of the
     * string and missing continuation bytes are read as zero bytes, and the
     * byte 0xFF (which cannot start a sequence) is reported as a single
     * U+FFFD replacement character.
     *
     * @param string $str Input string (UTF-8)
     * @param int $len Is set to the byte length of the character (as announced by its first byte)
     * @param int $pos Byte offset of the character
     * @param bool $hex If set, the code point is returned as "x" followed by its hexadecimal notation
     * @return int|string The code point (string if $hex is set)
     */
    public function utf8_ord(&$str, &$len, $pos = 0, $hex = false)
    {
        $ord = ord($str[$pos] ?? "\0");
        $len = 1;
        if ($ord > 128) {
            // The number of leading 1-bits minus one is the number of continuation bytes
            $bc = -1;
            for ($mbs = $ord; $mbs & 128; $mbs <<= 1) {
                $bc++;
            }
            if ($bc > 6) {
                // 0xFF has no 0-bit at all: the mask below would need a negative shift (ArithmeticError)
                $ord = 0xFFFD;
            } else {
                $len += $bc;
                // Mask out the UTF-8 lead-in bits
                $ord &= (1 << (6 - $bc)) - 1;
                // "Bring in" the six data bits of every continuation byte
                for ($i = $pos + 1; $bc > 0; $bc--, $i++) {
                    $ord = ($ord << 6) | (ord($str[$i] ?? "\0") & 63);
                }
            }
        }
        return $hex ? 'x' . dechex($ord) : $ord;
    }
}
‪TYPO3\CMS\IndexedSearch
‪TYPO3\CMS\IndexedSearch\Lexer\get_word
‪array get_word(&$str, $pos=0)
Definition: Lexer.php:153
‪TYPO3\CMS\IndexedSearch\Lexer\$lexerConf
‪array $lexerConf
Definition: Lexer.php:45
‪TYPO3\CMS\IndexedSearch\Lexer\charType
‪array charType($cp)
Definition: Lexer.php:252
‪TYPO3\CMS\Core\Charset\CharsetConverter
Definition: CharsetConverter.php:54
‪TYPO3\CMS\IndexedSearch\Lexer\utf8_is_letter
‪bool utf8_is_letter(&$str, &$len, $pos=0)
Definition: Lexer.php:178
‪TYPO3\CMS\IndexedSearch\Lexer\utf8_ord
‪int utf8_ord(&$str, &$len, $pos=0, $hex=false)
Definition: Lexer.php:281
‪TYPO3\CMS\IndexedSearch\Lexer\split2Words
‪array split2Words($wordString)
Definition: Lexer.php:60
‪TYPO3\CMS\IndexedSearch\Lexer\$debugString
‪string $debugString
Definition: Lexer.php:39
‪debug
‪debug($variable='', $title=null, $group=null)
Definition: GlobalDebugFunctions.php:19
‪TYPO3\CMS\IndexedSearch\Lexer
Definition: Lexer.php:27
‪TYPO3\CMS\IndexedSearch\Lexer\$debug
‪bool $debug
Definition: Lexer.php:33
‪TYPO3\CMS\Core\Utility\GeneralUtility
Definition: GeneralUtility.php:46
‪TYPO3\CMS\IndexedSearch\Lexer\addWords
‪addWords(&$words, &$wordString, $start, $len)
Definition: Lexer.php:105