TYPO3 CMS  TYPO3_6-2
Lexer.php
Go to the documentation of this file.
1 <?php
3 
/**
 * Lexer for indexed search: splits a UTF-8 string into the words that are to be
 * indexed or searched for.
 *
 * Runs of letters and digits form words. Characters listed in
 * $lexerConf['printjoins'] may appear inside a word but never at its end.
 * Runs of CJK characters are split into overlapping pairs (see addWords()).
 */
class Lexer {

	/**
	 * Debugging option: if set, split2Words() fills $debugString.
	 *
	 * @var boolean
	 */
	public $debug = FALSE;

	/**
	 * If $debug is set, this is filled with HTML output highlighting
	 * search / non-search words (for backend display).
	 *
	 * @var string
	 */
	public $debugString = '';

	/**
	 * Charset converter instance, created in the constructor. It is used for
	 * case conversion, UTF-8 aware length/substring and code point to
	 * character conversion.
	 *
	 * @var \TYPO3\CMS\Core\Charset\CharsetConverter
	 */
	public $csObj;

	/**
	 * Configuration of the lexer:
	 * - printjoins: code points that may join letters inside a word
	 *   (46 ".", 45 "-", 95 "_", 58 ":", 47 "/", 39 "'")
	 * - casesensitive: set, if case sensitive indexing is wanted
	 * - removeChars: code points removed from non-CJK words (45 "-")
	 *
	 * @var array
	 */
	public $lexerConf = array(
		'printjoins' => array(46, 45, 95, 58, 47, 39),
		'casesensitive' => FALSE,
		'removeChars' => array(45)
	);

	/**
	 * Constructor: initializes the charset converter in $csObj.
	 */
	public function __construct() {
		$this->csObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\\CMS\\Core\\Charset\\CharsetConverter');
	}

	/**
	 * Splits the input string into words.
	 *
	 * Unless $lexerConf['casesensitive'] is set, the string is converted to
	 * lower case first. When $debug is set, $debugString receives the input
	 * with all non-word stretches wrapped in a red <span>.
	 *
	 * @param string $wordString The string to split (expected to be UTF-8)
	 * @return array List of words
	 */
	public function split2Words($wordString) {
		// Reset debug string:
		$this->debugString = '';
		// Then convert the string to lowercase:
		if (!$this->lexerConf['casesensitive']) {
			$wordString = $this->csObj->conv_case('utf-8', $wordString, 'toLower');
		}
		// Now, splitting words:
		$pos = 0;
		$words = array();
		while (TRUE) {
			$match = $this->get_word($wordString, $pos);
			// get_word() returns FALSE at the end of the string; a zero length means no progress is possible either.
			if (!is_array($match) || !$match[1]) {
				break;
			}
			list($start, $len) = $match;
			$this->addWords($words, $wordString, $start, $len);
			if ($this->debug) {
				$this->debugString .= '<span style="color:red">' . htmlspecialchars(substr($wordString, $pos, ($start - $pos))) . '</span>' . htmlspecialchars(substr($wordString, $start, $len));
			}
			$pos = $start + $len;
		}
		if ($this->debug) {
			// Non-word characters after the last word belong to the debug output as well.
			$rest = (string)substr($wordString, $pos);
			if ($rest !== '') {
				$this->debugString .= '<span style="color:red">' . htmlspecialchars($rest) . '</span>';
			}
		}
		return $words;
	}

	/**********************************
	 *
	 * Helper functions
	 *
	 ********************************/

	/**
	 * Adds the word found at $start/$len in $wordString to $words.
	 *
	 * The type of the first character decides how the word is handled: a CJK
	 * sequence is added as overlapping pairs of characters, anything else is
	 * added as a single word after the $lexerConf['removeChars'] characters
	 * have been stripped from it.
	 *
	 * @param array $words Array of accumulated words (passed by reference, appended to)
	 * @param string $wordString Complete input string
	 * @param integer $start Byte offset of the word in $wordString
	 * @param integer $len Byte length of the word
	 * @return void
	 */
	public function addWords(&$words, &$wordString, $start, $len) {
		// Get word out of string:
		$theWord = substr($wordString, $start, $len);
		// Get the unicode number of the first char and find its type:
		$bc = 0;
		$cp = $this->utf8_ord($theWord, $bc);
		$typeInfo = $this->charType($cp);
		$cType = is_array($typeInfo) ? $typeInfo[0] : NULL;
		/*
		 * DESCRIPTION OF (CJK) ALGORITHM
		 *
		 * Continuous letters and numbers make up words. Spaces and symbols
		 * separate letters and numbers into words. This is sufficient for
		 * all western text.
		 *
		 * CJK doesn't use spaces or separators to separate words, so the only
		 * way to really find out what constitutes a word would be to have a
		 * dictionary and advanced heuristics. Instead, we form pairs from
		 * consecutive characters, in such a way that searches will find only
		 * characters that appear more-or-less the right sequence. For example:
		 *
		 *   ABCDE => AB BC CD DE
		 *
		 * This works okay since both the index and the search query is split
		 * in the same manner, and since the set of characters is huge so the
		 * extra matches are not significant.
		 *
		 * (Hint taken from ZOPEs chinese user group)
		 *
		 * [Kasper: As far as I can see this will only work well with or-searches!]
		 */
		if ($cType === 'cjk') {
			// Find total string length (in characters):
			$strlen = $this->csObj->utf8_strlen($theWord);
			// Traverse string length and add words as pairs of two chars.
			// A single character is added on its own; otherwise the last position is skipped because it has no partner.
			for ($a = 0; $a < $strlen; $a++) {
				if ($strlen == 1 || $a < $strlen - 1) {
					$words[] = $this->csObj->utf8_substr($theWord, $a, 2);
				}
			}
		} else {
			// Normal "single-byte" chars:
			// Remove chars:
			foreach ($this->lexerConf['removeChars'] as $skipJoin) {
				$theWord = str_replace($this->csObj->UnumberToChar($skipJoin), '', $theWord);
			}
			// Add word:
			$words[] = $theWord;
		}
	}

	/**
	 * Finds the next word in $str, starting the search at byte offset $pos.
	 *
	 * @param string $str Input string (passed by reference, not modified)
	 * @param integer $pos Byte offset to start searching at
	 * @return array|boolean array(start offset, byte length) of the word, or FALSE if the end of the string was reached without finding one
	 */
	public function get_word(&$str, $pos = 0) {
		$len = 0;
		// If return is TRUE, a word was found starting at this position, so returning position and length:
		if ($this->utf8_is_letter($str, $len, $pos)) {
			return array($pos, $len);
		}
		// If the return value was FALSE it means a sequence of non-word chars were found (or blank string) - so we will start another search for the word:
		$pos += $len;
		// Check end of string before looking for a word. isset() avoids the "Uninitialized string offset" warning of an out-of-range read.
		if (!isset($str[$pos])) {
			return FALSE;
		}
		$this->utf8_is_letter($str, $len, $pos);
		return array($pos, $len);
	}

	/**
	 * Measures the run of word characters OR of non-word characters that
	 * starts at byte offset $pos.
	 *
	 * A word ends at the first non-letter that is not a print-join character,
	 * or at a boundary between CJK and alphanumeric characters. Print-join
	 * characters at the end of a word are not counted, also when the word is
	 * the last thing in the string.
	 *
	 * @param string $str Input string (passed by reference, not modified)
	 * @param integer $len Set to the byte length of the run that was found
	 * @param integer $pos Byte offset to start at
	 * @return boolean TRUE if the run is a word, FALSE if it consists of non-word characters (or if $pos is at the end of the string)
	 */
	public function utf8_is_letter(&$str, &$len, $pos = 0) {
		$len = 0;
		$bc = 0;
		$cp = 0;
		$cType = ($cType_prev = FALSE);
		// Length of the word up to the first char of a trailing run of print-join chars (0 = none pending)
		$printJoinLgd = 0;
		// Letter type; looking for a letter?
		$letter = TRUE;
		if (!isset($str[$pos])) {
			// Return FALSE on end-of-string at this stage
			return FALSE;
		}
		while (TRUE) {
			// If characters has been obtained we will know whether the string starts as a sequence of letters or not.
			// At this point $cp / $cType describe the char read last, which is NOT yet included in $len.
			if ($len) {
				if ($letter) {
					// We are in a sequence of word chars. The char read last may end the word if it is
					// a non-letter or if it switches between CJK and alphanumeric.
					$isBoundary = !$cType
						|| $cType_prev === 'cjk' && ($cType === 'num' || $cType === 'alpha')
						|| $cType === 'cjk' && ($cType_prev === 'num' || $cType_prev === 'alpha');
					if ($isBoundary) {
						// Check if the char is NOT a print-join char because then it signifies the end of the word.
						// (Non-strict comparison on purpose: the configuration may hold numeric strings.)
						if (!in_array($cp, $this->lexerConf['printjoins'])) {
							// If a printjoin start length has been recorded, set that back now so the length is right (filtering out multiple end chars)
							if ($printJoinLgd) {
								$len = $printJoinLgd;
							}
							return TRUE;
						}
						// A print-join char is found, record the length if it has not been recorded already:
						if (!$printJoinLgd) {
							$printJoinLgd = $len;
						}
					} else {
						// When a true letter is found, reset printJoinLgd counter:
						$printJoinLgd = 0;
					}
				} elseif ($cType) {
					// End of non-word reached
					return FALSE;
				}
			}
			// Add byte-length of last found character
			$len += $bc;
			if (!isset($str[$pos])) {
				// End of string. A word must not end with print-join chars here either, so cut them off
				// exactly like it is done when a word is ended by a separator.
				if ($letter && $printJoinLgd) {
					$len = $printJoinLgd;
				}
				// Return status of string till now
				return $letter;
			}
			// Get next chars unicode number:
			$cp = $this->utf8_ord($str, $bc, $pos);
			$pos += $bc;
			// Determine the type:
			$cType_prev = $cType;
			$typeInfo = $this->charType($cp);
			$cType = is_array($typeInfo) ? $typeInfo[0] : NULL;
			if ($cType) {
				continue;
			}
			// Setting letter to FALSE if the first char was not a letter!
			if (!$len) {
				$letter = FALSE;
			}
		}
		return FALSE;
	}

	/**
	 * Determines the type of a character from its unicode number.
	 *
	 * @param integer $cp Unicode number (code point) of the character
	 * @return array|NULL array('num'), array('alpha') or array('cjk'); NULL if the character is none of these
	 */
	public function charType($cp) {
		// Numeric? (0-9)
		if ($cp >= 48 && $cp <= 57) {
			return array('num');
		}
		// LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
		if (
			// A-Z, a-z
			$cp >= 65 && $cp <= 90
			|| $cp >= 97 && $cp <= 122
			// Latin-1 letters, without the multiplication (215) and division (247) signs
			|| $cp >= 192 && $cp <= 255 && $cp != 215 && $cp != 247
			// Extended Latin
			|| $cp >= 256 && $cp < 640
			// Greek
			|| ($cp == 902 || $cp >= 904 && $cp < 1024)
			// Cyrillic
			|| ($cp >= 1024 && $cp < 1154 || $cp >= 1162 && $cp < 1328)
			// Hebrew
			|| ($cp >= 1424 && $cp < 1456 || $cp >= 1488 && $cp < 1523)
			// Arabic
			|| ($cp >= 1569 && $cp <= 1624 || $cp >= 1646 && $cp <= 1747)
			// U+1E00 - U+1FFF (additional Latin and Greek letters)
			|| $cp >= 7680 && $cp < 8192
		) {
			return array('alpha');
		}
		// Looking for CJK (Chinese / Japanese / Korean)
		// Ranges are not certain - deducted from the translation tables in typo3/sysext/core/Resources/Private/Charsets/csconvtbl/
		// Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
		if (
			$cp >= 12352 && $cp <= 12543
			|| $cp >= 12592 && $cp <= 12687
			|| $cp >= 13312 && $cp <= 19903
			|| $cp >= 19968 && $cp <= 40879
			|| $cp >= 44032 && $cp <= 55215
			|| $cp >= 131072 && $cp <= 195103
		) {
			return array('cjk');
		}
		// Anything else is a non-word character.
		return NULL;
	}

	/**
	 * Converts the UTF-8 byte sequence starting at byte offset $pos to its
	 * unicode number.
	 *
	 * The sequence is not validated. Invalid lead bytes and sequences that are
	 * cut off by the end of the string do not raise errors: only the bytes
	 * that are really present are consumed and counted in $len.
	 *
	 * @param string $str Input string (passed by reference, not modified)
	 * @param integer $len Set to the number of bytes the character occupies (at least 1)
	 * @param integer $pos Byte offset of the character
	 * @param boolean $hex If set, the number is returned as a hex string prefixed with "x"
	 * @return integer|string Unicode number, or "x" + hex representation if $hex is set
	 */
	public function utf8_ord(&$str, &$len, $pos = 0, $hex = FALSE) {
		$ord = isset($str[$pos]) ? ord($str[$pos]) : 0;
		$len = 1;
		// Every byte with the high bit set is part of a multi-byte sequence (128 included).
		if ($ord >= 128) {
			// Calculate number of extra bytes: one less than the number of leading 1-bits
			for ($bc = -1, $mbs = $ord; $mbs & 128; $mbs = $mbs << 1) {
				$bc++;
			}
			// The invalid byte 0xFF would give 7 and thereby a negative shift width below
			if ($bc > 6) {
				$bc = 6;
			}
			// Mask utf-8 lead-in bytes
			$ord = $ord & ((1 << (6 - $bc)) - 1);
			// "Bring in" data bytes, but never read beyond the end of the string
			for ($i = $pos + 1; $bc && isset($str[$i]); $bc--, $i++) {
				$ord = ($ord << 6) | (ord($str[$i]) & 63);
				$len++;
			}
		}
		return $hex ? 'x' . dechex($ord) : $ord;
	}

}
split2Words($wordString)
Definition: Lexer.php:78
utf8_ord(&$str, &$len, $pos=0, $hex=FALSE)
Definition: Lexer.php:291
utf8_is_letter(&$str, &$len, $pos=0)
Definition: Lexer.php:194
get_word(&$str, $pos=0)
Definition: Lexer.php:169
addWords(&$words, &$wordString, $start, $len)
Definition: Lexer.php:122
debug($variable='', $name=' *variable *', $line=' *line *', $file=' *file *', $recursiveDepth=3, $debugLevel=E_DEBUG)