‪TYPO3CMS  ‪main
Lexer.php
Go to the documentation of this file.
1 <?php
2 
3 /*
4  * This file is part of the TYPO3 CMS project.
5  *
6  * It is free software; you can redistribute it and/or modify it under
7  * the terms of the GNU General Public License, either version 2
8  * of the License, or any later version.
9  *
10  * For the full copyright and license information, please read the
11  * LICENSE.txt file that was distributed with this source code.
12  *
13  * The TYPO3 project - inspiring people to share!
14  */
15 
17 
/**
 * Lexer that splits a UTF-8 string into words for indexing and searching.
 *
 * Words are runs of numeric / alphabetic characters (optionally glued together by
 * the configured "print-join" characters such as "." or "-"). Runs of CJK
 * characters are emitted as overlapping two-character pairs.
 */
class Lexer
{
    protected const CHARTYPE_NUMBER = 'num';
    protected const CHARTYPE_ALPHA = 'alpha';
    // CJK (Chinese / Japanese / Korean)
    protected const CHARTYPE_CJK = 'cjk';

    /**
     * Lexer configuration.
     *
     * - printjoins: code points that do not terminate a word when they appear
     *   between letters (see utf8_is_letter()).
     * - casesensitive: when false, split2Words() lowercases the input first.
     */
    protected array $lexerConf = [
        'printjoins' => [
            46, // .
            45, // -
            95, // _
            58, // :
            47, // /
            39, // '
        ],
        'casesensitive' => false, // Set, if case-sensitive indexing is wanted
    ];

    /**
     * Splits the input string into a list of words.
     *
     * @param string $wordString UTF-8 string to split
     * @return array List of words found, in order of appearance
     */
    public function split2Words(string $wordString): array
    {
        // Convert the string to lowercase unless case-sensitive indexing is configured:
        if (!$this->lexerConf['casesensitive']) {
            $wordString = mb_strtolower($wordString, 'utf-8');
        }
        // Now, splitting words:
        $pos = 0;
        $words = [];
        while (true) {
            $match = $this->get_word($wordString, $pos);
            // get_word() returns false once the end of the string has been reached.
            if ($match === false) {
                break;
            }
            [$start, $len] = $match;
            if (!$len) {
                break;
            }
            $this->addWords($words, $wordString, $start, $len);
            $pos = $start + $len;
        }
        return $words;
    }

    /**********************************
     *
     * Helper functions
     *
     ********************************/

    /**
     * Appends the word found at byte offset $start (byte length $len) to $words.
     *
     * DESCRIPTION OF (CJK) ALGORITHM
     *
     * Continuous letters and numbers make up words. Spaces and symbols
     * separate letters and numbers into words. This is sufficient for
     * all western text.
     *
     * CJK doesn't use spaces or separators to separate words, so the only
     * way to really find out what constitutes a word would be to have a
     * dictionary and advanced heuristics. Instead, we form pairs from
     * consecutive characters, in such a way that searches will find only
     * characters that appear more-or-less the right sequence. For example:
     *
     *     ABCDE => AB BC CD DE
     *
     * This works okay since both the index and the search query is split
     * in the same manner, and since the set of characters is huge so the
     * extra matches are not significant.
     *
     * (Hint taken from ZOPEs chinese user group)
     *
     * [Kasper: As far as I can see this will only work well with or-searches!]
     *
     * @param array $words Word list that is appended to (by reference)
     * @param string $wordString The complete string the word is taken from
     * @param int $start Byte offset of the word inside $wordString
     * @param int $len Byte length of the word
     */
    public function addWords(array &$words, string &$wordString, int $start, int $len): void
    {
        // Get word out of string ($start / $len are byte offsets, hence substr()):
        $theWord = substr($wordString, $start, $len);
        // The type of the first character decides how the word is added:
        $bc = 0;
        $cp = $this->utf8_ord($theWord, $bc);
        $cType = $this->charType((int)$cp);
        if ($cType === self::CHARTYPE_CJK) {
            // Find total string length (in characters):
            $strlen = mb_strlen($theWord, 'utf-8');
            // Traverse string length and add words as pairs of two chars.
            // A single CJK character is added on its own.
            for ($a = 0; $a < $strlen; $a++) {
                if ($strlen === 1 || $a < $strlen - 1) {
                    $words[] = mb_substr($theWord, $a, 2, 'utf-8');
                }
            }
        } else {
            // Add word:
            $words[] = $theWord;
        }
    }

    /**
     * Finds the next word in $str, starting the search at byte offset $pos.
     *
     * @param string $str String to search in
     * @param int $pos Byte offset to start at
     * @return bool|array [start offset, byte length] of the word, or false at end of string
     */
    public function get_word(string &$str, int $pos = 0): bool|array
    {
        $len = 0;
        // If return is TRUE, a word was found starting at this position, so returning position and length:
        if ($this->utf8_is_letter($str, $len, $pos)) {
            return [$pos, $len];
        }
        // If the return value was FALSE it means a sequence of non-word chars was found
        // (or blank string) - so we skip it and start another search for the word:
        $pos += $len;
        if ((string)($str[$pos] ?? '') === '') {
            // Check end of string before looking for word of course.
            return false;
        }
        $this->utf8_is_letter($str, $len, $pos);
        return [$pos, $len];
    }

    /**
     * Scans $str from byte offset $pos and reports whether a word or a run of
     * non-word characters starts there.
     *
     * The type of the first character decides what is scanned: if it is a letter,
     * the scan runs to the end of that word; otherwise it runs to the first letter.
     * The byte length of the scanned sequence is written to $len.
     *
     * @param string $str String to scan
     * @param int $len Set to the byte length of the sequence found (by reference)
     * @param int $pos Byte offset to start at
     * @return bool TRUE if a word starts at $pos, FALSE for a non-word sequence or end of string
     */
    public function utf8_is_letter(string &$str, int &$len, int $pos = 0): bool
    {
        $len = 0;
        $bc = 0;
        $cp = 0;
        $printJoinLgd = 0;
        $cType = ($cType_prev = false);
        // Letter type
        $letter = true;
        // looking for a letter?
        if (($str[$pos] ?? '') === '') {
            // Return FALSE on end-of-string at this stage
            return false;
        }
        while (true) {
            // If characters have been obtained we will know whether the string starts as a sequence of letters or not:
            if ($len) {
                if ($letter) {
                    // We are in a sequence of words. The word ends on a non-letter
                    // character or on a switch between CJK and number/alpha characters.
                    if (
                        !$cType
                        || ($cType_prev === self::CHARTYPE_CJK && ($cType === self::CHARTYPE_NUMBER || $cType === self::CHARTYPE_ALPHA))
                        || ($cType === self::CHARTYPE_CJK && ($cType_prev === self::CHARTYPE_NUMBER || $cType_prev === self::CHARTYPE_ALPHA))
                    ) {
                        // Check if the non-letter char is NOT a print-join char because then it signifies the end of the word.
                        if (!in_array($cp, $this->lexerConf['printjoins'], true)) {
                            // If a printjoin start length has been recorded, set that back now so the length is right (filtering out multiple end chars)
                            if ($printJoinLgd) {
                                $len = $printJoinLgd;
                            }
                            return true;
                        }
                        // If a printJoin char is found, record the length if it has not been recorded already:
                        if (!$printJoinLgd) {
                            $printJoinLgd = $len;
                        }
                    } else {
                        // When a true letter is found, reset printJoinLgd counter:
                        $printJoinLgd = 0;
                    }
                } elseif ($cType) {
                    // end of non-word reached
                    return false;
                }
            }
            // add byte-length of last found character
            $len += $bc;
            if ((string)($str[$pos] ?? '') === '') {
                // End of string; return status of string till now.
                // NOTE(review): if the string ends directly after print-join characters,
                // $len still includes them on this path (no reset to $printJoinLgd) - confirm this is intended.
                return $letter;
            }
            // Get next chars unicode number:
            $cp = $this->utf8_ord($str, $bc, $pos);
            $pos += $bc;
            // Determine the type:
            $cType_prev = $cType;
            $cType = $this->charType((int)$cp);
            if ($cType !== null) {
                continue;
            }
            // Setting letter to FALSE if the first char was not a letter!
            if (!$len) {
                $letter = false;
            }
        }
    }

    /**
     * Determines the character type of a Unicode code point.
     *
     * @param int $cp Unicode code point
     * @return string|null One of the CHARTYPE_* constants, or null if the code point is not a word character
     */
    public function charType(int $cp): ?string
    {
        // Numeric? (ASCII digits 0-9)
        if ($cp >= 48 && $cp <= 57) {
            return self::CHARTYPE_NUMBER;
        }
        // LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
        if (
            ($cp >= 65 && $cp <= 90)
            || ($cp >= 97 && $cp <= 122)
            // 215 and 247 are excluded from this range
            || ($cp >= 192 && $cp <= 255 && $cp !== 215 && $cp !== 247)
            || ($cp >= 256 && $cp < 640)
            || $cp === 902
            || ($cp >= 904 && $cp < 1024)
            || ($cp >= 1024 && $cp < 1154)
            || ($cp >= 1162 && $cp < 1328)
            || ($cp >= 1424 && $cp < 1456)
            || ($cp >= 1488 && $cp < 1523)
            || ($cp >= 1569 && $cp <= 1624)
            || ($cp >= 1646 && $cp <= 1747)
            || ($cp >= 7680 && $cp < 8192)
        ) {
            return self::CHARTYPE_ALPHA;
        }
        // Looking for CJK (Chinese / Japanese / Korean)
        // Ranges are not certain - deducted from the translation tables in typo3/sysext/core/Resources/Private/Charsets/csconvtbl/
        // Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
        if (
            ($cp >= 12352 && $cp <= 12543)
            || ($cp >= 12592 && $cp <= 12687)
            || ($cp >= 13312 && $cp <= 19903)
            || ($cp >= 19968 && $cp <= 40879)
            || ($cp >= 44032 && $cp <= 55215)
            || ($cp >= 131072 && $cp <= 195103)
        ) {
            return self::CHARTYPE_CJK;
        }
        return null;
    }

    /**
     * Decodes the UTF-8 character starting at byte offset $pos into its code point.
     *
     * The input is not validated: malformed or truncated sequences are decoded
     * leniently (missing bytes count as zero) instead of raising warnings or errors.
     *
     * @param string $str UTF-8 string
     * @param int $len Set to the byte length of the character, as announced by its lead byte (by reference)
     * @param int $pos Byte offset of the character
     * @param bool $hex If set, the code point is returned as "x" followed by its lowercase hex value
     * @return int|string Code point as integer, or as hex string if $hex is set
     */
    public function utf8_ord(string &$str, int &$len, int $pos = 0, bool $hex = false): int|string
    {
        // A missing byte is read as NUL so that an empty/short string does not trigger a warning.
        $ord = ord($str[$pos] ?? "\0");
        $len = 1;
        if ($ord > 128) {
            // Calculate number of extra bytes from the leading 1-bits of the first byte:
            for ($bc = -1, $mbs = $ord; $mbs & 128; $mbs <<= 1) {
                $bc++;
            }
            $len += $bc;
            // Mask utf-8 lead-in bytes. For the invalid lead byte 0xFF $bc is 7, which
            // would be a negative shift (ArithmeticError); no payload bits remain then.
            $mask = $bc < 7 ? (1 << (6 - $bc)) - 1 : 0;
            $ord &= $mask;
            // "bring in" data bytes (6 payload bits each)
            for ($i = $pos + 1; $bc; $bc--, $i++) {
                $ord = ($ord << 6) | (ord($str[$i] ?? "\0") & 63);
            }
        }
        return $hex ? 'x' . dechex($ord) : $ord;
    }
}
‪TYPO3\CMS\IndexedSearch
‪TYPO3\CMS\IndexedSearch\Lexer\split2Words
‪array split2Words(string $wordString)
Definition: Lexer.php:49
‪TYPO3\CMS\IndexedSearch\Lexer\utf8_is_letter
‪bool utf8_is_letter(string &$str, int &$len, int $pos=0)
Definition: Lexer.php:151
‪TYPO3\CMS\IndexedSearch\Lexer\$lexerConf
‪array $lexerConf
Definition: Lexer.php:30
‪TYPO3\CMS\IndexedSearch\Lexer\CHARTYPE_NUMBER
‪const CHARTYPE_NUMBER
Definition: Lexer.php:25
‪TYPO3\CMS\IndexedSearch\Lexer\CHARTYPE_ALPHA
‪const CHARTYPE_ALPHA
Definition: Lexer.php:26
‪TYPO3\CMS\IndexedSearch\Lexer\utf8_ord
‪int string utf8_ord(string &$str, int &$len, int $pos=0, bool $hex=false)
Definition: Lexer.php:252
‪TYPO3\CMS\IndexedSearch\Lexer\charType
‪string null charType(int $cp)
Definition: Lexer.php:224
‪TYPO3\CMS\IndexedSearch\Lexer\addWords
‪addWords(array &$words, string &$wordString, int $start, int $len)
Definition: Lexer.php:84
‪TYPO3\CMS\IndexedSearch\Lexer\get_word
‪array bool get_word(string &$str, int $pos=0)
Definition: Lexer.php:126
‪TYPO3\CMS\IndexedSearch\Lexer\CHARTYPE_CJK
‪const CHARTYPE_CJK
Definition: Lexer.php:28
‪TYPO3\CMS\IndexedSearch\Lexer
Definition: Lexer.php:24