TYPO3 CMS  TYPO3_8-7
Lexer.php
Go to the documentation of this file.
1 <?php
3 
4 /*
5  * This file is part of the TYPO3 CMS project.
6  *
7  * It is free software; you can redistribute it and/or modify it under
8  * the terms of the GNU General Public License, either version 2
9  * of the License, or any later version.
10  *
11  * For the full copyright and license information, please read the
12  * LICENSE.txt file that was distributed with this source code.
13  *
14  * The TYPO3 project - inspiring people to share!
15  */
16 
/**
 * Lexer for indexing: splits a UTF-8 string into a list of words.
 *
 * Western text is split on non-letter characters; CJK runs are split into
 * overlapping two-character pairs (see addWords()).
 */
class Lexer
{
    /**
     * If TRUE, split2Words() additionally renders the split into $debugString.
     *
     * @var bool
     */
    public $debug = false;

    /**
     * HTML debug output of the last split2Words() call: skipped (non-word)
     * parts are wrapped in a red <span>, words are appended as-is.
     *
     * @var string
     */
    public $debugString = '';

    /**
     * Charset conversion object, created in the constructor.
     *
     * @var \TYPO3\CMS\Core\Charset\CharsetConverter
     */
    public $csObj;

    /**
     * Configuration of the lexer. All character values are Unicode code points.
     *
     * @var array
     */
    public $lexerConf = [
        // Characters that may join letters inside a word: . - _ : / '
        'printjoins' => [46, 45, 95, 58, 47, 39],
        // Set, if case sensitive indexing is wanted.
        'casesensitive' => false,
        // Characters removed from non-CJK words before they are added: -
        'removeChars' => [45]
    ];

    /**
     * Constructor: Initializes the charset class
     */
    public function __construct()
    {
        $this->csObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance(\TYPO3\CMS\Core\Charset\CharsetConverter::class);
    }

    /**
     * Splitting string into words.
     * Used for indexing, can also be used to find words in query.
     *
     * @param string $wordString String with UTF-8 content to process.
     * @return array Array of words in UTF-8
     */
    public function split2Words($wordString)
    {
        // Reset debug string:
        $this->debugString = '';
        // Then convert the string to lowercase:
        if (!$this->lexerConf['casesensitive']) {
            $wordString = mb_strtolower($wordString, 'utf-8');
        }
        // Now, splitting words:
        $pos = 0;
        $words = [];
        while (true) {
            $match = $this->get_word($wordString, $pos);
            // get_word() returns FALSE at end of string; a zero length also means "no more words".
            if (!is_array($match) || empty($match[1])) {
                break;
            }
            list($start, $len) = $match;
            $this->addWords($words, $wordString, $start, $len);
            if ($this->debug) {
                $this->debugString .= '<span style="color:red">' . htmlspecialchars(substr($wordString, $pos, ($start - $pos))) . '</span>' . htmlspecialchars(substr($wordString, $start, $len));
            }
            $pos = $start + $len;
        }
        return $words;
    }

    /**********************************
     *
     * Helper functions
     *
     ********************************/
    /**
     * Add word to word-array.
     * This function should be used to make sure CJK sequences are split up in the right way.
     *
     * @param array $words Array of accumulated words (passed by reference, appended to)
     * @param string $wordString Complete input string
     * @param int $start Start position of word in input string (byte offset)
     * @param int $len The length of the word string from start position (in bytes)
     */
    public function addWords(&$words, &$wordString, $start, $len)
    {
        // Get word out of string:
        $theWord = substr($wordString, $start, $len);
        // Get the first character's unicode number and find its type:
        $bc = 0;
        $cp = $this->utf8_ord($theWord, $bc);
        list($cType) = $this->charType($cp);
        /*
         * DESCRIPTION OF (CJK) ALGORITHM
         *
         * Continuous letters and numbers make up words. Spaces and symbols
         * separate letters and numbers into words. This is sufficient for
         * all western text.
         *
         * CJK doesn't use spaces or separators to separate words, so the only
         * way to really find out what constitutes a word would be to have a
         * dictionary and advanced heuristics. Instead, we form pairs from
         * consecutive characters, in such a way that searches will find only
         * characters that appear more-or-less the right sequence. For example:
         *
         *   ABCDE => AB BC CD DE
         *
         * This works okay since both the index and the search query is split
         * in the same manner, and since the set of characters is huge so the
         * extra matches are not significant.
         *
         * (Hint taken from ZOPEs chinese user group)
         *
         * [Kasper: As far as I can see this will only work well with or-searches!]
         */
        if ($cType === 'cjk') {
            // Find total string length (in characters):
            $strlen = mb_strlen($theWord, 'utf-8');
            // Traverse string length and add words as pairs of two chars.
            // A single character is added alone; otherwise the last character
            // only appears as the second half of the final pair.
            for ($a = 0; $a < $strlen; $a++) {
                if ($strlen == 1 || $a < $strlen - 1) {
                    $words[] = mb_substr($theWord, $a, 2, 'utf-8');
                }
            }
        } else {
            // Normal "single-byte" chars:
            // Remove chars:
            foreach ($this->lexerConf['removeChars'] as $skipJoin) {
                $theWord = str_replace($this->csObj->UnumberToChar($skipJoin), '', $theWord);
            }
            // Add word:
            $words[] = $theWord;
        }
    }

    /**
     * Get the first word in a given utf-8 string (initial non-letters will be skipped)
     *
     * @param string $str Input string (reference)
     * @param int $pos Starting position in input string (byte offset)
     * @return array|bool 0: start, 1: len or FALSE if no word has been found
     */
    public function get_word(&$str, $pos = 0)
    {
        $len = 0;
        // If return is TRUE, a word was found starting at this position, so returning position and length:
        if ($this->utf8_is_letter($str, $len, $pos)) {
            return [$pos, $len];
        }
        // If the return value was FALSE it means a sequence of non-word chars were found (or blank string) - so we will start another search for the word:
        $pos += $len;
        // Check end of string before looking for word of course.
        // isset() is used because reading an out-of-range string offset raises a notice/warning.
        if (!isset($str[$pos])) {
            return false;
        }
        $this->utf8_is_letter($str, $len, $pos);
        return [$pos, $len];
    }

    /**
     * See if a character is a letter (or a string of letters or non-letters).
     *
     * @param string $str Input string (reference)
     * @param int $len Byte-length of the letter or non-letter sequence found (reference, return value)
     * @param int $pos Starting position in input string (byte offset)
     * @return bool TRUE if the sequence starting at $pos is a word, FALSE if it is a run of non-letters (or end of string)
     */
    public function utf8_is_letter(&$str, &$len, $pos = 0)
    {
        $len = 0;
        $bc = 0;
        $cp = 0;
        $printJoinLgd = 0;
        $cType_prev = false;
        $cType = false;
        // Letter type: are we looking for a letter?
        $letter = true;
        if (!isset($str[$pos])) {
            // Return FALSE on end-of-string at this stage
            return false;
        }
        while (true) {
            // If characters has been obtained we will know whether the string starts as a sequence of letters or not.
            // Note: $cType/$cp describe the character read in the previous iteration, whose byte length ($bc) has not been added to $len yet.
            if ($len) {
                if ($letter) {
                    // We are in a sequence of words
                    if (
                        !$cType
                        || $cType_prev === 'cjk' && ($cType === 'num' || $cType === 'alpha')
                        || $cType === 'cjk' && ($cType_prev === 'num' || $cType_prev === 'alpha')
                    ) {
                        // Check if the non-letter char is NOT a print-join char because then it signifies the end of the word.
                        if (!in_array($cp, $this->lexerConf['printjoins'])) {
                            // If a printjoin start length has been recorded, set that back now so the length is right (filtering out multiple end chars)
                            if ($printJoinLgd) {
                                $len = $printJoinLgd;
                            }
                            return true;
                        }
                        // If a printJoin char is found, record the length if it has not been recorded already:
                        if (!$printJoinLgd) {
                            $printJoinLgd = $len;
                        }
                    } else {
                        // When a true letter is found, reset printJoinLgd counter:
                        $printJoinLgd = 0;
                    }
                } elseif ($cType) {
                    // end of non-word reached
                    return false;
                }
            }
            // add byte-length of last found character
            $len += $bc;
            if (!isset($str[$pos])) {
                // End of string; return status of string till now
                return $letter;
            }
            // Get next chars unicode number:
            $cp = $this->utf8_ord($str, $bc, $pos);
            $pos += $bc;
            // Determine the type:
            $cType_prev = $cType;
            list($cType) = $this->charType($cp);
            if ($cType) {
                continue;
            }
            // Setting letter to FALSE if the first char was not a letter!
            if (!$len) {
                $letter = false;
            }
        }
    }

    /**
     * Determine the type of character
     *
     * @param int $cp Unicode number to evaluate
     * @return array|null Single-element array with the type ('num', 'alpha' or 'cjk'), or NULL if the character is none of these
     */
    public function charType($cp)
    {
        // Numeric?
        if ($cp >= 48 && $cp <= 57) {
            return ['num'];
        }
        // LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
        if (
            $cp >= 65 && $cp <= 90
            || $cp >= 97 && $cp <= 122
            || $cp >= 192 && $cp <= 255 && $cp != 215 && $cp != 247
            || $cp >= 256 && $cp < 640
            || ($cp == 902 || $cp >= 904 && $cp < 1024)
            || ($cp >= 1024 && $cp < 1154 || $cp >= 1162 && $cp < 1328)
            || ($cp >= 1424 && $cp < 1456 || $cp >= 1488 && $cp < 1523)
            || ($cp >= 1569 && $cp <= 1624 || $cp >= 1646 && $cp <= 1747)
            || $cp >= 7680 && $cp < 8192
        ) {
            return ['alpha'];
        }
        // Looking for CJK (Chinese / Japanese / Korean)
        // Ranges are not certain - deducted from the translation tables in typo3/sysext/core/Resources/Private/Charsets/csconvtbl/
        // Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
        if (
            $cp >= 12352 && $cp <= 12543
            || $cp >= 12592 && $cp <= 12687
            || $cp >= 13312 && $cp <= 19903
            || $cp >= 19968 && $cp <= 40879
            || $cp >= 44032 && $cp <= 55215
            || $cp >= 131072 && $cp <= 195103
        ) {
            return ['cjk'];
        }
        // Anything else (whitespace, punctuation, symbols, ...) has no type.
        return null;
    }

    /**
     * Converts a UTF-8 multibyte character to a UNICODE codepoint
     *
     * @param string $str UTF-8 multibyte character string (reference)
     * @param int $len The length of the character in bytes (reference, return value)
     * @param int $pos Starting position in input string (byte offset)
     * @param bool $hex If set, then a hex. number is returned
     * @return int|string UNICODE codepoint, or 'x' followed by its hex representation if $hex is set
     */
    public function utf8_ord(&$str, &$len, $pos = 0, $hex = false)
    {
        // Guarded read: an out-of-range offset yields 0 instead of raising a warning.
        $ord = isset($str[$pos]) ? ord($str[$pos]) : 0;
        $len = 1;
        if ($ord === 255) {
            // 0xFF can never occur in UTF-8. Counting its lead bits would give 7 and make the
            // mask computation below shift by a negative amount (ArithmeticError on PHP 7+),
            // so consume just this byte and report U+FFFD (replacement character).
            $ord = 0xFFFD;
        } elseif ($ord > 128) {
            // calculate number of extra bytes from the number of leading 1-bits
            for ($bc = -1, $mbs = $ord; $mbs & 128; $mbs = $mbs << 1) {
                $bc++;
            }
            $len += $bc;
            // mask utf-8 lead-in bytes
            $ord = $ord & ((1 << (6 - $bc)) - 1);
            // "bring in" data bytes; bytes missing from a truncated sequence contribute 0 bits
            for ($i = $pos + 1; $bc; $bc--, $i++) {
                $ord = ($ord << 6) | (isset($str[$i]) ? ord($str[$i]) & 63 : 0);
            }
        }
        return $hex ? 'x' . dechex($ord) : $ord;
    }
}
debug($variable='', $name=' *variable *', $line=' *line *', $file=' *file *', $recursiveDepth=3, $debugLevel='E_DEBUG')
split2Words($wordString)
Definition: Lexer.php:72
static makeInstance($className,... $constructorArguments)
utf8_is_letter(&$str, &$len, $pos=0)
Definition: Lexer.php:187
utf8_ord(&$str, &$len, $pos=0, $hex=false)
Definition: Lexer.php:288
get_word(&$str, $pos=0)
Definition: Lexer.php:162
addWords(&$words, &$wordString, $start, $len)
Definition: Lexer.php:115