TYPO3_7-6/_lexer_8php_source.html

 <?php
 namespace TYPO3\CMS\IndexedSearch;

 /*
  * This file is part of the TYPO3 CMS project.
  *
  * It is free software; you can redistribute it and/or modify it under
  * the terms of the GNU General Public License, either version 2
  * of the License, or any later version.
  *
  * For the full copyright and license information, please read the
  * LICENSE.txt file that was distributed with this source code.
  *
  * The TYPO3 project - inspiring people to share!
  */

 /**
  * Lexer for the indexed search: splits a UTF-8 encoded string into words.
  *
  * Words are runs of numeric, alphabetic or CJK characters (see charType()).
  * Configured "print join" characters may connect letters inside a word, and
  * CJK runs are emitted as overlapping two-character pairs (see addWords()).
  */
 class Lexer
 {
     /**
      * If set, split2Words() builds an HTML trace of the splitting in $debugString.
      *
      * @var bool
      */
     public $debug = false;

     /**
      * HTML trace of the last split2Words() call (only filled when $debug is set).
      *
      * @var string
      */
     public $debugString = '';

     /**
      * Charset converter, created in the constructor. This class calls
      * conv_case(), utf8_strlen(), utf8_substr() and UnumberToChar() on it.
      *
      * @var \TYPO3\CMS\Core\Charset\CharsetConverter
      */
     public $csObj;

     /**
      * Lexer configuration:
      * - printjoins: Unicode code points that may join letters inside a word.
      * - casesensitive: if false, input is lowercased before splitting.
      * - removeChars: Unicode code points stripped from non-CJK words.
      *
      * @var array
      */
     public $lexerConf = [
         // Characters: . - _ : / '
         'printjoins' => [46, 45, 95, 58, 47, 39],
         // Set, if case sensitive indexing is wanted.
         'casesensitive' => false,
         // Character: -
         'removeChars' => [45]
     ];

     /**
      * Constructor: fetches the CharsetConverter instance used for case
      * conversion and UTF-8 string handling.
      */
     public function __construct()
     {
         $this->csObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance(\TYPO3\CMS\Core\Charset\CharsetConverter::class);
     }

     /**
      * Splits the input string into words.
      *
      * @param string $wordString The UTF-8 string to split into words.
      * @return array Flat list of the words found, in order of appearance.
      */
     public function split2Words($wordString)
     {
         // Reset debug string:
         $this->debugString = '';
         // Convert the string to lowercase unless case sensitive indexing is configured:
         if (!$this->lexerConf['casesensitive']) {
             $wordString = $this->csObj->conv_case('utf-8', $wordString, 'toLower');
         }
         // Now, splitting words:
         $pos = 0;
         $words = [];
         while (true) {
             $found = $this->get_word($wordString, $pos);
             // get_word() returns FALSE at end-of-string; do not destructure a non-array.
             if (!is_array($found)) {
                 break;
             }
             list($start, $len) = $found;
             if (!$len) {
                 break;
             }
             $this->addWords($words, $wordString, $start, $len);
             if ($this->debug) {
                 // Skipped (non-word) bytes are shown in red, followed by the word itself.
                 $this->debugString .= '<span style="color:red">' . htmlspecialchars(substr($wordString, $pos, ($start - $pos))) . '</span>' . htmlspecialchars(substr($wordString, $start, $len));
             }
             $pos = $start + $len;
         }
         return $words;
     }

     /**********************************
      *
      * Helper functions
      *
      ********************************/
     /**
      * Extracts the word at the given byte range and appends it to $words.
      *
      * DESCRIPTION OF (CJK) ALGORITHM
      *
      * Continuous letters and numbers make up words. Spaces and symbols
      * separate letters and numbers into words. This is sufficient for
      * all western text.
      *
      * CJK doesn't use spaces or separators to separate words, so the only
      * way to really find out what constitutes a word would be to have a
      * dictionary and advanced heuristics. Instead, we form pairs from
      * consecutive characters, in such a way that searches will find only
      * characters that appear more-or-less the right sequence. For example:
      *
      *     ABCDE => AB BC CD DE
      *
      * This works okay since both the index and the search query is split
      * in the same manner, and since the set of characters is huge so the
      * extra matches are not significant.
      *
      * (Hint taken from ZOPEs chinese user group)
      *
      * [Kasper: As far as I can see this will only work well with or-searches!]
      *
      * @param array $words Word list to append to (passed by reference).
      * @param string $wordString The complete input string (not modified).
      * @param int $start Byte offset of the word in $wordString.
      * @param int $len Byte length of the word.
      * @return void
      */
     public function addWords(&$words, &$wordString, $start, $len)
     {
         // Get word out of string:
         $theWord = substr($wordString, $start, $len);
         // The type of the first character decides how the whole word is handled:
         $bc = 0;
         $cp = $this->utf8_ord($theWord, $bc);
         $typeInfo = $this->charType($cp);
         $cType = is_array($typeInfo) ? $typeInfo[0] : null;
         if ($cType == 'cjk') {
             // Find total string length (in characters):
             $strlen = $this->csObj->utf8_strlen($theWord);
             // Add overlapping pairs of two chars; a single character is added on its own:
             $pairCount = $strlen == 1 ? 1 : $strlen - 1;
             for ($a = 0; $a < $pairCount; $a++) {
                 $words[] = $this->csObj->utf8_substr($theWord, $a, 2);
             }
         } else {
             // Normal "single-byte" chars:
             // Remove chars:
             foreach ($this->lexerConf['removeChars'] as $skipJoin) {
                 $theWord = str_replace($this->csObj->UnumberToChar($skipJoin), '', $theWord);
             }
             // Add word:
             $words[] = $theWord;
         }
     }

     /**
      * Finds the next word in $str, searching from byte offset $pos.
      *
      * @param string $str Input string (passed by reference, not modified).
      * @param int $pos Byte offset to start searching at.
      * @return array|bool Array of [byte offset, byte length] of the word, or FALSE if the end of the string was reached.
      */
     public function get_word(&$str, $pos = 0)
     {
         $len = 0;
         // If return is TRUE, a word was found starting at this position, so returning position and length:
         if ($this->utf8_is_letter($str, $len, $pos)) {
             return [$pos, $len];
         }
         // If the return value was FALSE it means a sequence of non-word chars were found (or blank string) - so we will start another search for the word:
         $pos += $len;
         // Check end of string before looking for a word. isset() is used so that no
         // out-of-range string offset is read (which would raise a notice/warning).
         if (!isset($str[$pos])) {
             return false;
         }
         $this->utf8_is_letter($str, $len, $pos);
         return [$pos, $len];
     }

     /**
      * Scans $str from byte offset $pos and measures the run found there: either
      * a word (sequence of letters, optionally connected by print join chars) or
      * a sequence of non-word characters.
      *
      * @param string $str Input string (passed by reference, not modified).
      * @param int $len Set to the byte length of the run found (passed by reference).
      * @param int $pos Byte offset to start at.
      * @return bool TRUE if the run is a word, FALSE if it is a non-word run or $pos is at/after the end of the string.
      */
     public function utf8_is_letter(&$str, &$len, $pos = 0)
     {
         $len = 0;
         // Byte length of the character read last:
         $bc = 0;
         // Unicode number of the character read last:
         $cp = 0;
         // Length of the word before the first of a pending run of print join chars (0 = none pending):
         $printJoinLgd = 0;
         // Letter type of the current and the previous character:
         $cType = ($cType_prev = false);
         // Are we looking for a letter? Flipped to FALSE if the first char is not a letter.
         $letter = true;
         if (!isset($str[$pos])) {
             // Return FALSE on end-of-string at this stage
             return false;
         }
         while (true) {
             // If characters has been obtained we will know whether the string starts as a sequence of letters or not:
             if ($len) {
                 if ($letter) {
                     // We are in a sequence of words. The word may end at a non-letter or at a switch between CJK and num/alpha:
                     if (
                         !$cType
                         || $cType_prev == 'cjk' && ($cType === 'num' || $cType === 'alpha')
                         || $cType == 'cjk' && ($cType_prev === 'num' || $cType_prev === 'alpha')
                     ) {
                         // Check if the non-letter char is NOT a print-join char because then it signifies the end of the word.
                         // The comparison is intentionally non-strict: $lexerConf is public and may carry code points as strings.
                         if (!in_array($cp, $this->lexerConf['printjoins'])) {
                             // If a printjoin start length has been recorded, set that back now so the length is right (filtering out multiple end chars)
                             if ($printJoinLgd) {
                                 $len = $printJoinLgd;
                             }
                             return true;
                         }
                         // A printJoin char was found, record the length if it has not been recorded already:
                         if (!$printJoinLgd) {
                             $printJoinLgd = $len;
                         }
                     } else {
                         // When a true letter is found, reset printJoinLgd counter:
                         $printJoinLgd = 0;
                     }
                 } elseif ($cType) {
                     // end of non-word reached
                     return false;
                 }
             }
             // add byte-length of last found character
             $len += $bc;
             if (!isset($str[$pos])) {
                 // End of string; return status of string till now
                 return $letter;
             }
             // Get next chars unicode number:
             $cp = $this->utf8_ord($str, $bc, $pos);
             $pos += $bc;
             // Determine the type (charType() returns NULL for non-letters):
             $cType_prev = $cType;
             $typeInfo = $this->charType($cp);
             $cType = is_array($typeInfo) ? $typeInfo[0] : null;
             if ($cType) {
                 continue;
             }
             // Setting letter to FALSE if the first char was not a letter!
             if (!$len) {
                 $letter = false;
             }
         }
         return false;
     }

     /**
      * Determines the letter type of a Unicode code point.
      *
      * @param int $cp Unicode number to evaluate.
      * @return array|null Array with the type as first element ("num", "alpha" or "cjk"), or NULL if the code point is none of these.
      */
     public function charType($cp)
     {
         // Numeric?
         if ($cp >= 48 && $cp <= 57) {
             return ['num'];
         }
         // LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
         if (
             $cp >= 65 && $cp <= 90
             || $cp >= 97 && $cp <= 122
             || $cp >= 192 && $cp <= 255 && $cp != 215 && $cp != 247
             || $cp >= 256 && $cp < 640
             || ($cp == 902 || $cp >= 904 && $cp < 1024)
             || ($cp >= 1024 && $cp < 1154 || $cp >= 1162 && $cp < 1328)
             || ($cp >= 1424 && $cp < 1456 || $cp >= 1488 && $cp < 1523)
             || ($cp >= 1569 && $cp <= 1624 || $cp >= 1646 && $cp <= 1747)
             || $cp >= 7680 && $cp < 8192
         ) {
             return ['alpha'];
         }
         // Looking for CJK (Chinese / Japanese / Korean)
         // Ranges are not certain - deducted from the translation tables in typo3/sysext/core/Resources/Private/Charsets/csconvtbl/
         // Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
         if (
             $cp >= 12352 && $cp <= 12543
             || $cp >= 12592 && $cp <= 12687
             || $cp >= 13312 && $cp <= 19903
             || $cp >= 19968 && $cp <= 40879
             || $cp >= 44032 && $cp <= 55215
             || $cp >= 131072 && $cp <= 195103
         ) {
             return ['cjk'];
         }
         // Anything else is not a letter:
         return null;
     }

     /**
      * Converts the UTF-8 byte sequence at byte offset $pos to its Unicode number.
      *
      * The input is not validated. Bytes missing at the end of the string are
      * treated as 0, and $len may then point past the end of the string.
      *
      * @param string $str UTF-8 string (passed by reference, not modified).
      * @param int $len Set to the byte length of the sequence, as announced by its lead byte (passed by reference).
      * @param int $pos Byte offset of the lead byte.
      * @param bool $hex If set, the number is returned as "x" followed by its hexadecimal representation.
      * @return int|string The Unicode number, or the "x..." string if $hex is set.
      */
     public function utf8_ord(&$str, &$len, $pos = 0, $hex = false)
     {
         $ord = isset($str[$pos]) ? ord($str[$pos]) : 0;
         $len = 1;
         if ($ord > 128) {
             // Calculate number of extra bytes from the leading 1-bits of the lead byte:
             for ($bc = -1, $mbs = $ord; $mbs & 128; $mbs = $mbs << 1) {
                 $bc++;
             }
             $len += $bc;
             // Mask utf-8 lead-in bits. The shift is clamped because the invalid lead byte 0xFF
             // would give a negative shift count, which throws an ArithmeticError on PHP 7+.
             $shift = 6 - $bc;
             $ord = $ord & ($shift > 0 ? (1 << $shift) - 1 : 0);
             // "Bring in" data bytes:
             for ($i = $pos + 1; $bc; $bc--, $i++) {
                 $dataByte = isset($str[$i]) ? ord($str[$i]) : 0;
                 $ord = ($ord << 6) | ($dataByte & 63);
             }
         }
         return $hex ? 'x' . dechex($ord) : $ord;
     }
 }
TYPO3

debug
debug($variable='', $name=' *variable *', $line=' *line *', $file=' *file *', $recursiveDepth=3, $debugLevel='E_DEBUG')
Definition: GlobalDebugFunctions.php:14

TYPO3\CMS\IndexedSearch\Lexer\split2Words
split2Words($wordString)
Definition: Lexer.php:73

TYPO3\CMS\IndexedSearch\Lexer\$lexerConf
$lexerConf
Definition: Lexer.php:49

TYPO3\CMS\IndexedSearch\Lexer\$csObj
$csObj
Definition: Lexer.php:42

TYPO3\CMS\Core\Utility\GeneralUtility\makeInstance
static makeInstance($className)
Definition: GeneralUtility.php:4518

$a
$a
Definition: auth_adodb_example.php:19

TYPO3\CMS\IndexedSearch\Lexer\utf8_is_letter
utf8_is_letter(&$str, &$len, $pos=0)
Definition: Lexer.php:189

TYPO3\CMS\IndexedSearch\Lexer\$debugString
$debugString
Definition: Lexer.php:35

TYPO3\CMS\IndexedSearch\Lexer\utf8_ord
utf8_ord(&$str, &$len, $pos=0, $hex=false)
Definition: Lexer.php:291

TYPO3\CMS\IndexedSearch\Lexer\get_word
get_word(&$str, $pos=0)
Definition: Lexer.php:164

TYPO3\CMS\IndexedSearch\Lexer\addWords
addWords(&$words, &$wordString, $start, $len)
Definition: Lexer.php:117

TYPO3\CMS\IndexedSearch\Lexer\__construct
__construct()
Definition: Lexer.php:61

TYPO3\CMS\IndexedSearch

TYPO3\CMS\IndexedSearch\Lexer
Definition: Lexer.php:21

TYPO3\CMS\IndexedSearch\Lexer\$debug
$debug
Definition: Lexer.php:28

TYPO3\CMS\IndexedSearch\Lexer\charType
charType($cp)
Definition: Lexer.php:264