TYPO3 CMS  TYPO3_7-6
Indexer.php
Go to the documentation of this file.
1 <?php
3 
4 /*
5  * This file is part of the TYPO3 CMS project.
6  *
7  * It is free software; you can redistribute it and/or modify it under
8  * the terms of the GNU General Public License, either version 2
9  * of the License, or any later version.
10  *
11  * For the full copyright and license information, please read the
12  * LICENSE.txt file that was distributed with this source code.
13  *
14  * The TYPO3 project - inspiring people to share!
15  */
16 
21 
25 class Indexer
26 {
/**
 * Messages explaining why a document was or was not (re-)indexed.
 * The keys are the status codes that index this array when the
 * "Indexing needed / not needed, reason: ..." log entries are built.
 *
 * @var array
 */
public $reasons = [
    -1 => 'mtime matched the document, so no changes detected and no content updated',
    -2 => 'The minimum age was not exceeded',
    1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
    2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
    3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
    4 => 'Page has never been indexed (is not represented in the index_phash table).'
];

/**
 * Comma separated list of tag names whose sections splitHTMLContent()
 * removes from the body before the remaining tags are stripped.
 *
 * @var string
 */
public $excludeSections = 'script,style';

/**
 * External parser objects keyed by file extension; filled by initializeExternalParsers().
 *
 * @var array
 */
public $external_parsers = [];

/**
 * Default group list.
 * NOTE(review): not referenced in the visible part of this class;
 * backend_initIndexer() hardcodes the same value - confirm usage elsewhere.
 *
 * @var string
 */
public $defaultGrList = '0,-1';

/**
 * Set by init() from the "maxAge" extension setting multiplied by 3600.
 *
 * @var int
 */
public $tstamp_maxAge = 0;

/**
 * Set by init() from the "minAge" extension setting multiplied by 3600.
 *
 * @var int
 */
public $tstamp_minAge = 0;

/**
 * Upper limit that indexRegularDocument() compares $externalFileCounter against;
 * set by init() from the "maxExternalFiles" extension setting.
 *
 * @var int
 */
public $maxExternalFiles = 0;

/**
 * If TRUE, indexTypo3PageContent() indexes regardless of the mtime check.
 * hook_indexContent() sets it when the crawler requested a re-index.
 *
 * @var bool
 */
public $forceIndexing = false;

/**
 * Set to TRUE by hook_indexContent() when a crawler session requested re-indexing.
 *
 * @var bool
 */
public $crawlerActive = false;

/**
 * Template that splitHTMLContent() copies to initialise its result array.
 * (Declaration restored: the listing only contained the array entries.)
 *
 * @var array
 */
public $defaultContentArray = [
    'title' => '',
    'description' => '',
    'keywords' => '',
    'body' => ''
];

/**
 * NOTE(review): not referenced in the visible part of this class.
 *
 * @var int
 */
public $wordcount = 0;

/**
 * Number of external files indexed so far; incremented by indexRegularDocument()
 * and compared against $maxExternalFiles.
 * (Declaration restored: the property is used but was missing from the listing.)
 *
 * @var int
 */
public $externalFileCounter = 0;

/**
 * Configuration of the current indexing run (page id, content, mtime, flags ...),
 * filled by hook_indexContent() or the backend_* methods.
 *
 * @var array
 */
public $conf = [];

/**
 * Unserialized extension configuration of indexed_search; set by init().
 *
 * @var array
 */
public $indexerConfig = [];

/**
 * Hashes identifying the current page; the methods in view read the "phash" key.
 *
 * @var array
 */
public $hash = [];

/**
 * Hashes of the external file currently being indexed; set by indexRegularDocument().
 *
 * @var array
 */
public $file_phash_arr = [];

/**
 * Result of splitHTMLContent() for the current page (title, description, keywords, body).
 *
 * @var array
 */
public $contentParts = [];

/**
 * Hash calculated over the imploded $contentParts in indexTypo3PageContent().
 *
 * @var string
 */
public $content_md5h = '';

/**
 * Internal log; reset by indexRegularDocument() for every indexed file part.
 *
 * @var array
 */
public $internal_log = [];

/**
 * Content last fetched by indexExternalUrl().
 * (Declaration restored: the property is used but was missing from the listing.)
 *
 * @var string
 */
public $indexExternalUrl_content = '';

/**
 * Copy of conf['cHash_array'] prepared by init() (cHash added, encryptionKey removed).
 *
 * @var array
 */
public $cHashParams = [];

/**
 * NOTE(review): not referenced in the visible part of this class;
 * presumably the value range of the word frequency - confirm.
 *
 * @var int
 */
public $freqRange = 32000;

/**
 * NOTE(review): not referenced in the visible part of this class;
 * presumably the maximum word frequency ratio - confirm.
 *
 * @var float
 */
public $freqMax = 0.1;

/**
 * Whether metaphone search is enabled; set by init() from the extension configuration.
 *
 * @var bool
 */
public $enableMetaphoneSearch = false;

/**
 * Set by init(): TRUE when metaphone search is enabled and the index_words table is not used.
 * (Declaration restored: the property is used but was missing from the listing.)
 *
 * @var bool
 */
public $storeMetaphoneInfoAsWords;

/**
 * NOTE(review): not referenced in the visible part of this class.
 *
 * @var string
 */
public $metaphoneContent = '';

/**
 * Charset conversion object; created by init().
 *
 * @var \TYPO3\CMS\Core\Charset\CharsetConverter
 */
public $csObj;

/**
 * Metaphone hook object; created by init() when metaphone search is enabled and a hook is configured.
 * (Declaration restored: the property is used but was missing from the listing.)
 *
 * @var object
 */
public $metaphoneObj;

/**
 * Lexer object that deconstructs text into words; created by init().
 *
 * @var object
 */
public $lexerObj;

/**
 * Set by init() from the "flagBitMask" extension setting.
 *
 * @var int
 */
public $flagBitMask;
/**
 * Frontend hook: decides whether the rendered page may be indexed and, if so,
 * collects the page data into $this->conf and runs the indexer.
 *
 * Indexing is skipped (with a log message) when frontend indexing is disabled
 * and no crawler session is active, when the page has the "no_search" flag,
 * when the page is "no_cache", or when sys_language_uid differs from
 * sys_language_content.
 *
 * @param object $pObj Calling frontend object; its applicationData, config, page,
 *                     no_cache, language, cHash, content and register members are read.
 *                     NOTE(review): presumably the TypoScriptFrontendController - confirm.
 * @return void
 */
public function hook_indexContent(&$pObj)
{
    // Indexer configuration from Extension Manager interface:
    $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
    if (!is_array($indexerConfig)) {
        // unserialize() returns FALSE for missing/broken configuration; fall back to defaults.
        $indexerConfig = [];
    }
    // Crawler activation:
    // Requirements are that the crawler is loaded, a crawler session is running and
    // re-indexing requested as processing instruction.
    $procInstructions = isset($pObj->applicationData['tx_crawler']['parameters']['procInstructions'])
        ? $pObj->applicationData['tx_crawler']['parameters']['procInstructions']
        : null;
    if (
        \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')
        && !empty($pObj->applicationData['tx_crawler']['running'])
        // Guard: in_array() must only be handed a real array.
        && is_array($procInstructions)
        && in_array('tx_indexedsearch_reindex', $procInstructions, true)
    ) {
        // Setting simple log message:
        $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
        // Crawler active flag, and force indexing regardless of mtime:
        $this->crawlerActive = true;
        $this->forceIndexing = true;
    }
    // Nothing to do (and nothing to log) unless indexing is enabled for the page.
    if (!$pObj->config['config']['index_enable']) {
        return;
    }
    $this->log_push('Index page', '');
    if (!empty($indexerConfig['disableFrontendIndexing']) && !$this->crawlerActive) {
        $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
    } elseif ($pObj->page['no_search']) {
        $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
    } elseif ($pObj->no_cache) {
        $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
    } elseif ((int)$pObj->sys_language_uid !== (int)$pObj->sys_language_content) {
        $this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
    } else {
        // Root line uids
        $rootlineUids = [];
        foreach ($pObj->config['rootLine'] as $rlkey => $rldat) {
            $rootlineUids[$rlkey] = $rldat['uid'];
        }
        // Setting up internal configuration from the frontend object:
        $this->conf = [
            // Page id
            'id' => $pObj->id,
            // Page type
            'type' => $pObj->type,
            // sys_language UID of the language of the indexing.
            'sys_language_uid' => $pObj->sys_language_uid,
            // MP variable, if any (Mount Points)
            'MP' => $pObj->MP,
            // Group list
            'gr_list' => $pObj->gr_list,
            // cHash string for additional parameters
            'cHash' => $pObj->cHash,
            // Array of the additional parameters
            'cHash_array' => $pObj->cHash_array,
            // The creation date of the TYPO3 page
            'crdate' => $pObj->page['crdate'],
            // reg1 of the caching table. Not known what practical use this has.
            'page_cache_reg1' => $pObj->page_cache_reg1,
            'rootline_uids' => $rootlineUids,
            // Content string (HTML of TYPO3 page)
            'content' => $pObj->content,
            // Alternative title for indexing
            'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),
            // Character set of content (will be converted to utf-8 during indexing)
            'metaCharset' => $pObj->metaCharset,
            // Most recent modification time (seconds) of the content on the page.
            // Used to evaluate whether it should be re-indexed.
            'mtime' => isset($pObj->register['SYS_LASTCHANGED'])
                ? $pObj->register['SYS_LASTCHANGED']
                : $pObj->page['SYS_LASTCHANGED'],
            // Whether to index external documents like PDF, DOC etc. (if possible)
            'index_externals' => $pObj->config['config']['index_externals'],
            // Length of description text (max 250, default 200)
            'index_descrLgd' => $pObj->config['config']['index_descrLgd'],
            // Whether to index keywords/description meta tags; defaults to TRUE when not configured
            'index_metatags' => isset($pObj->config['config']['index_metatags'])
                ? $pObj->config['config']['index_metatags']
                : true,
            // Set to zero:
            'recordUid' => 0,
            'freeIndexUid' => 0,
            'freeIndexSetId' => 0
        ];
        // Init and start indexing:
        $this->init();
        $this->indexTypo3PageContent();
    }
    $this->log_pull();
}
320 
321  /****************************
322  *
323  * Backend API
324  *
325  ****************************/
/**
 * Prepares the indexer for indexing content from outside the frontend rendering.
 *
 * Fills $this->conf with the given page identification plus fixed defaults
 * (group list "0,-1", external documents enabled, description length 200,
 * meta tags enabled) and then calls init().
 *
 * @param int $id Stored as conf['id'] (page id)
 * @param int $type Stored as conf['type'] (page type)
 * @param int $sys_language_uid Stored as conf['sys_language_uid']
 * @param string $MP Stored as conf['MP'] (mount point variable, if any)
 * @param array $uidRL Stored as conf['rootline_uids']
 * @param array $cHash_array Additional parameters, stored as conf['cHash_array']
 * @param bool $createCHash If TRUE, conf['cHash'] is calculated from $cHash_array, otherwise it is ''
 * @return void
 */
public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array = [], $createCHash = false)
{
    // cHash string for the additional parameters (only calculated on request):
    $cHashValue = '';
    if ($createCHash) {
        /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
        $cacheHash = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\CacheHashCalculator::class);
        $cHashValue = $cacheHash->generateForParameters(GeneralUtility::implodeArrayForUrl('', $cHash_array));
    }
    $this->conf = [
        // Information about the page for which the indexing takes place
        'id' => $id,
        'type' => $type,
        'sys_language_uid' => $sys_language_uid,
        'MP' => $MP,
        // Group list (hardcoded for now...)
        'gr_list' => '0,-1',
        'cHash' => $cHashValue,
        'cHash_array' => $cHash_array,
        // Set to defaults
        'freeIndexUid' => 0,
        'freeIndexSetId' => 0,
        'page_cache_reg1' => 0,
        // Root line uids
        'rootline_uids' => $uidRL,
        // Whether to index external documents like PDF, DOC etc. (if possible)
        'index_externals' => 1,
        // Length of description text (max 250, default 200)
        'index_descrLgd' => 200,
        // Whether to index document keywords and description (if present)
        'index_metatags' => true
    ];
    // Initialize the indexer with this configuration:
    $this->init();
}
380 
/**
 * Stores the free index uid and free index set id in the indexer configuration.
 *
 * @param int $freeIndexUid Value for conf['freeIndexUid']
 * @param int $freeIndexSetId Value for conf['freeIndexSetId']
 * @return void
 */
public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0)
{
    $freeIndexSettings = [
        'freeIndexUid' => $freeIndexUid,
        'freeIndexSetId' => $freeIndexSetId
    ];
    foreach ($freeIndexSettings as $confKey => $confValue) {
        $this->conf[$confKey] = $confValue;
    }
}
393 
/**
 * Indexes arbitrary content as if it was a TYPO3 page.
 *
 * The given values are escaped with htmlspecialchars() and wrapped into a
 * minimal HTML document, which is then handed to indexTypo3PageContent().
 *
 * @param string $title Goes into the <title> tag
 * @param string $keywords Goes into the "keywords" meta tag
 * @param string $description Goes into the "description" meta tag
 * @param string $content Goes into the <body>
 * @param string $charset Stored as conf['metaCharset'] (content is converted to utf-8 during indexing)
 * @param int $mtime Stored as conf['mtime'] (most recent modification time of the content)
 * @param int $crdate Stored as conf['crdate'] (creation date of the content)
 * @param int $recordUid Stored as conf['recordUid'] (UID of the record, if applicable)
 * @return void
 */
public function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0)
{
    // Timestamps and record reference of the content:
    $this->conf['mtime'] = $mtime;
    $this->conf['crdate'] = $crdate;
    $this->conf['recordUid'] = $recordUid;
    // Escape everything that ends up in the fake HTML document:
    $escapedTitle = htmlspecialchars($title);
    $escapedKeywords = htmlspecialchars($keywords);
    $escapedDescription = htmlspecialchars($description);
    $escapedContent = htmlspecialchars($content);
    // Construct fake HTML for parsing:
    $this->conf['content'] = '
 <html>
 <head>
 <title>' . $escapedTitle . '</title>
 <meta name="keywords" content="' . $escapedKeywords . '" />
 <meta name="description" content="' . $escapedDescription . '" />
 </head>
 <body>
 ' . $escapedContent . '
 </body>
 </html>';
    // Character set of content (will be converted to utf-8 during indexing)
    $this->conf['metaCharset'] = $charset;
    // No alternative title for indexing
    $this->conf['indexedDocTitle'] = '';
    // Index content as if it was a TYPO3 page:
    $this->indexTypo3PageContent();
}
437 
438  /********************************
439  *
440  * Initialization
441  *
442  *******************************/
/**
 * Initializes the indexer from $this->conf and the extension configuration.
 *
 * Prepares the cHash parameters, calculates the page hashes, reads the
 * extension settings (ages, file limit, flag bit mask, metaphone options),
 * and creates the external parsers, the lexer, the optional metaphone hook
 * object and the charset converter.
 *
 * @return void
 */
public function init()
{
    // Prepare the additional (cHash) parameters:
    $additionalParams = $this->conf['cHash_array'];
    if (is_array($additionalParams) && !empty($additionalParams)) {
        if ($this->conf['cHash']) {
            // Add this so that URL's come out right...
            $additionalParams['cHash'] = $this->conf['cHash'];
        }
        unset($additionalParams['encryptionKey']);
    }
    $this->cHashParams = $additionalParams;
    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();
    // Indexer configuration from Extension Manager interface:
    $extensionSettings = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
    $this->indexerConfig = $extensionSettings;
    $this->tstamp_minAge = MathUtility::forceIntegerInRange($extensionSettings['minAge'] * 3600, 0);
    $this->tstamp_maxAge = MathUtility::forceIntegerInRange($extensionSettings['maxAge'] * 3600, 0);
    $this->maxExternalFiles = MathUtility::forceIntegerInRange($extensionSettings['maxExternalFiles'], 0, 1000, 5);
    $this->flagBitMask = MathUtility::forceIntegerInRange($extensionSettings['flagBitMask'], 0, 255);
    // Workaround: If the extension configuration was not updated yet, the value is not existing
    $this->enableMetaphoneSearch = !isset($extensionSettings['enableMetaphoneSearch']) || $extensionSettings['enableMetaphoneSearch'];
    $this->storeMetaphoneInfoAsWords = !IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;
    // Initialize external document parsers (configured in ext_localconf.php):
    if ($this->conf['index_externals']) {
        $this->initializeExternalParsers();
    }
    // Initialize lexer (class that deconstructs the text into words); falls back to the default lexer:
    $lexerReference = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'] ?: 'TYPO3\\CMS\\IndexedSearch\\Lexer';
    $this->lexerObj = GeneralUtility::getUserObj($lexerReference);
    $this->lexerObj->debug = $this->indexerConfig['debugMode'];
    // Initialize metaphone hook:
    // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
    if ($this->enableMetaphoneSearch && $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']) {
        $this->metaphoneObj = GeneralUtility::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']);
        $this->metaphoneObj->pObj = $this;
    }
    // Init charset class:
    $this->csObj = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Charset\CharsetConverter::class);
}
488 
/**
 * Creates the external parser objects configured in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers']
 * and registers them in $this->external_parsers, keyed by file extension.
 *
 * A parser is only kept if its initParser() call returns a true value.
 *
 * @return void
 */
public function initializeExternalParsers()
{
    if (
        !isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'])
        || !is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'])
    ) {
        return;
    }
    foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] as $extension => $_objRef) {
        $parser = GeneralUtility::getUserObj($_objRef);
        // NOTE(review): getUserObj() presumably returns a non-object when the reference
        // cannot be resolved - confirm. Skip such entries instead of dereferencing them.
        if (!is_object($parser)) {
            unset($this->external_parsers[$extension]);
            continue;
        }
        $parser->pObj = $this;
        // Register before initialising, so the parser is reachable through pObj during initParser():
        $this->external_parsers[$extension] = $parser;
        // Init parser and if it returns FALSE, unset its entry again:
        if (!$parser->initParser($extension)) {
            unset($this->external_parsers[$extension]);
        }
    }
}
509 
510  /********************************
511  *
512  * Indexing; TYPO3 pages (HTML content)
513  *
514  *******************************/
/**
 * Indexes the page content found in $this->conf['content'].
 *
 * Indexing only happens when the mtime check returns a positive status code,
 * when the group list is not registered for the page hash yet, or when
 * indexing is forced. If an entry with the same content hash already exists
 * (and the max-age was not exceeded), only timestamp, set id, group list and
 * rootline are updated.
 *
 * @return void
 */
public function indexTypo3PageContent()
{
    $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
    $is_grlist = $this->is_grlist_set($this->hash['phash']);
    $indexingNeeded = $check > 0 || !$is_grlist || $this->forceIndexing;
    if (!$indexingNeeded) {
        $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        return;
    }
    // Setting message:
    if ($this->forceIndexing) {
        $reasonMessage = 'Indexing needed, reason: Forced';
    } elseif ($check > 0) {
        $reasonMessage = 'Indexing needed, reason: ' . $this->reasons[$check];
    } else {
        $reasonMessage = 'Indexing needed, reason: Updates gr_list!';
    }
    $this->log_setTSlogMessage($reasonMessage, 1);
    // Divide into title,keywords,description and body:
    $this->log_push('Split content', '');
    $this->contentParts = $this->splitHTMLContent($this->conf['content']);
    if ($this->conf['indexedDocTitle']) {
        $this->contentParts['title'] = $this->conf['indexedDocTitle'];
    }
    $this->log_pull();
    // Hash over the actual page content (title, description and keywords included on purpose,
    // so a changed page title also leads to re-indexing).
    $this->content_md5h = IndexedSearchUtility::md5inthash(implode('', $this->contentParts));
    // Check whether a page with the very same content hash is already indexed. If so, the
    // content does not have to be indexed again, regardless of mtime - unless the
    // configured max-age was exceeded ($check === 1).
    $checkCHash = $this->checkContentHash();
    if (is_array($checkCHash) && $check !== 1) {
        // Update the timestamp
        $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
        $this->updateSetId($this->hash['phash']);
        // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
        $this->update_grlist($checkCHash['phash'], $this->hash['phash']);
        $this->updateRootline();
        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
        return;
    }
    $Pstart = GeneralUtility::milliseconds();
    $this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
    $this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
    $this->log_pull();
    // Splitting words
    $this->log_push('Extract words from content', '');
    $splitInWords = $this->processWordsInArrays($this->contentParts);
    $this->log_pull();
    // Analyse the indexed words.
    $this->log_push('Analyse the extracted words', '');
    $indexArr = $this->indexAnalyze($splitInWords);
    $this->log_pull();
    // Submitting page (phash) record
    $this->log_push('Submitting page', '');
    $this->submitPage();
    $this->log_pull();
    // Check words and submit to word list if not there
    $this->log_push('Check word list and submit words', '');
    if (IndexedSearchUtility::isTableUsed('index_words')) {
        $this->checkWordList($indexArr);
        $this->submitWords($indexArr, $this->hash['phash']);
    }
    $this->log_pull();
    // Set parsetime
    $this->updateParsetime($this->hash['phash'], GeneralUtility::milliseconds() - $Pstart);
    // Checking external files if configured for.
    $this->log_push('Checking external files', '');
    if ($this->conf['index_externals']) {
        $this->extractLinks($this->conf['content']);
    }
    $this->log_pull();
}
591 
/**
 * Splits HTML content into title, keywords, description and body text.
 *
 * The head is everything before the first "<body"; if the document has no
 * body tag, the whole content is treated as head (so title and meta tags are
 * still found) and the body is empty. The body is reduced to the
 * TYPO3SEARCH_begin/end sections, freed from the sections listed in
 * $this->excludeSections, and finally stripped of all tags.
 *
 * @param string $content HTML content to index
 * @return array Copy of $this->defaultContentArray with 'title', 'keywords', 'description' and 'body' filled
 */
public function splitHTMLContent($content)
{
    // divide head from body ( u-ouh :) )
    $contentArr = $this->defaultContentArray;
    $bodyPart = stristr($content, '<body');
    if ($bodyPart === false) {
        $bodyPart = '';
    }
    $contentArr['body'] = $bodyPart;
    // Computing the head length explicitly also works for an empty body part;
    // a negative length of "-0" would cut the head down to an empty string.
    $headPart = (string)substr($content, 0, strlen($content) - strlen($bodyPart));
    // get title; only the part after the first colon is used when the title contains one
    $this->embracingTags($headPart, 'TITLE', $contentArr['title'], $dummy2, $dummy);
    $titleParts = explode(':', $contentArr['title'], 2);
    $contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);
    // get keywords and description metatags
    if ($this->conf['index_metatags']) {
        $meta = [];
        $i = 0;
        // Each successful call stores the tag's attribute string in $meta[$i]
        // and continues with the rest of the head.
        while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $meta[$i])) {
            $i++;
        }
        // @todo The code below stops at first unset tag. Is that correct?
        for ($i = 0; isset($meta[$i]); $i++) {
            $attributes = GeneralUtility::get_tag_attributes($meta[$i]);
            // Meta tags without name/content (e.g. http-equiv or charset tags) are simply skipped.
            $metaName = isset($attributes['name']) ? (string)$attributes['name'] : '';
            $metaContent = isset($attributes['content']) ? $attributes['content'] : '';
            if (stripos($metaName, 'keywords') !== false) {
                $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaContent);
            }
            if (stripos($metaName, 'description') !== false) {
                $contentArr['description'] .= ',' . $metaContent;
            }
        }
    }
    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($contentArr['body']);
    // Get rid of unwanted sections (ie. scripting and style stuff) in body
    foreach (explode(',', $this->excludeSections) as $tag) {
        while ($this->embracingTags($contentArr['body'], $tag, $dummy, $contentArr['body'], $dummy2)) {
            // embracingTags() removes one section per call; repeat until none is left.
        }
    }
    // remove tags, but first make sure we don't concatenate words by doing it
    $contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
    $contentArr['body'] = trim(strip_tags($contentArr['body']));
    $contentArr['keywords'] = trim($contentArr['keywords']);
    $contentArr['description'] = trim($contentArr['description']);
    // Return array
    return $contentArr;
}
643 
/**
 * Extracts the charset from a <meta http-equiv="Content-Type" ...> tag.
 *
 * @param string $content HTML content
 * @return string|null The charset value as written in the tag, or NULL if no such tag or no charset was found
 */
public function getHTMLcharset($content)
{
    $contentTypeTagPattern = '/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i';
    $charsetPattern = '/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i';
    // First locate the Content-Type meta tag, then look for "charset=..." inside that tag only.
    if (!preg_match($contentTypeTagPattern, $content, $tagMatch)) {
        return null;
    }
    if (!preg_match($charsetPattern, $tagMatch[0], $charsetMatch)) {
        return null;
    }
    return $charsetMatch[1];
}
658 
/**
 * Converts HTML content to utf-8 and converts entities to utf-8 characters.
 *
 * @param string $content HTML content
 * @param string $charset Charset of the content; if empty it is looked up with getHTMLcharset()
 * @return string The converted content as returned by the charset converter
 */
public function convertHTMLToUtf8($content, $charset = '')
{
    // Find charset:
    if (!$charset) {
        $charset = $this->getHTMLcharset($content);
    }
    $normalizedCharset = $this->csObj->parse_charset($charset);
    // Convert charset, unless it is unknown or the content is utf-8 already:
    $needsConversion = $normalizedCharset && $normalizedCharset !== 'utf-8';
    $utf8Content = $needsConversion
        ? $this->csObj->utf8_encode($content, $normalizedCharset)
        : $content;
    // Convert entities, assuming document is now UTF-8:
    return $this->csObj->entities_to_utf8($utf8Content, true);
}
678 
/**
 * Finds the first occurrence of <$tagName ...> (case-insensitive) in $string.
 *
 * If the matching end tag is found, $tagContent receives the text between the
 * tags and $stringAfter receives $string with the whole element removed.
 * If no end tag is found, $tagContent is '' and $stringAfter is only the text
 * following the start tag (the text before the tag is NOT kept).
 * The output variables are left untouched when the tag is not found.
 *
 * @param string $string String to search in
 * @param string $tagName Tag name, eg. "script"
 * @param string $tagContent Passed by reference: content inside found tag
 * @param string $stringAfter Passed by reference: see description above
 * @param string $paramList Passed by reference: attribute string of the start tag (text between tag name and ">")
 * @return bool TRUE if the start tag was found, otherwise FALSE
 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
{
    $endTag = '</' . $tagName . '>';
    $startTag = '<' . $tagName;
    // Case-insensitive search for the start tag.
    $startPosition = stripos((string)$string, $startTag);
    if ($startPosition === false) {
        return false;
    }
    // Split "<attributes>" from the text following the start tag. A start tag without a
    // closing ">" yields only one part; the remainder is then an empty string.
    $tagParts = explode('>', (string)substr($string, $startPosition + strlen($startTag)), 2);
    $paramList = $tagParts[0];
    $textAfterStartTag = isset($tagParts[1]) ? $tagParts[1] : '';
    $fromEndTag = stristr($textAfterStartTag, $endTag);
    if ($fromEndTag !== false) {
        $stringBefore = (string)substr($string, 0, $startPosition);
        $tagContent = (string)substr($textAfterStartTag, 0, strlen($textAfterStartTag) - strlen($fromEndTag));
        $stringAfter = $stringBefore . substr($fromEndTag, strlen($endTag));
    } else {
        $tagContent = '';
        $stringAfter = $textAfterStartTag;
    }
    return true;
}
713 
/**
 * Reduces $body to the sections marked with <!--TYPO3SEARCH_begin--> and
 * <!--TYPO3SEARCH_end--> comments.
 *
 * Text following a "begin" marker is kept. Text preceding an "end" marker is
 * only kept when it was not introduced by a "begin" marker (ie. the content
 * before a leading "end" marker); it is added once.
 *
 * @param string $body Body content, passed by reference and modified if markers were found
 * @return bool TRUE if at least one marker was found (and $body was rebuilt), otherwise FALSE
 */
public function typoSearchTags(&$body)
{
    $expBody = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body);
    if (!is_array($expBody) || count($expBody) <= 1) {
        return false;
    }
    $body = '';
    // Last segment that was neither a "begin" nor an "end" section.
    $prev = '';
    foreach ($expBody as $val) {
        $part = explode('-->', $val, 2);
        $marker = trim($part[0]);
        if ($marker === 'begin') {
            // A marker without the closing "-->" has no content part.
            $body .= isset($part[1]) ? $part[1] : '';
            $prev = '';
        } elseif ($marker === 'end') {
            $body .= $prev;
            // Consume the pending segment so a second "end" marker cannot add it again.
            $prev = '';
        } else {
            $prev = $val;
        }
    }
    return true;
}
741 
/**
 * Extracts the hyperlinks from $content and indexes the documents they point to.
 *
 * Links with a URL scheme (and no local path) are indexed as external URLs if
 * the "indexExternalURLs" setting is on. Other links without a query string
 * are resolved to a local file, which is either indexed directly or - when
 * the crawler is loaded and "useCrawlerForExternalFiles" is on - added to the
 * crawler queue.
 *
 * @param string $content HTML content to index. To be precise: the body part of the HTML
 * @return void
 */
public function extractLinks($content)
{
    // Get links:
    $list = $this->extractHyperLinks($content);
    // Stays NULL unless files are to be handed to the crawler.
    $crawler = null;
    if ($this->indexerConfig['useCrawlerForExternalFiles'] && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
        $this->includeCrawlerClass();
        $crawler = GeneralUtility::makeInstance(\tx_crawler_lib::class);
    }
    // Traverse links:
    foreach ($list as $linkInfo) {
        // localPath means: This file is sent by a download script. While the indexed URL has to
        // point to $linkInfo['href'], the absolute path to the file is specified here!
        $localPath = isset($linkInfo['localPath']) ? $linkInfo['localPath'] : '';
        // Decode entities:
        $linkSource = htmlspecialchars_decode($localPath ? $localPath : $linkInfo['href']);
        // Parse URL; parse_url() returns FALSE for seriously malformed URLs:
        $qParts = parse_url($linkSource);
        if (!is_array($qParts)) {
            $qParts = [];
        }
        // Check for jumpurl (TYPO3 specific thing...)
        if (!empty($qParts['query']) && strpos($qParts['query'], 'jumpurl=') !== false) {
            parse_str($qParts['query'], $getP);
            $linkSource = isset($getP['jumpurl']) && is_string($getP['jumpurl']) ? $getP['jumpurl'] : '';
            $qParts = parse_url($linkSource);
            if (!is_array($qParts)) {
                $qParts = [];
            }
        }
        if (!$localPath && !empty($qParts['scheme'])) {
            if ($this->indexerConfig['indexExternalURLs']) {
                // Index external URL (http or otherwise)
                $this->indexExternalUrl($linkSource);
            }
            continue;
        }
        if (!empty($qParts['query'])) {
            // Links with a query string are not treated as files.
            continue;
        }
        $linkSource = urldecode($linkSource);
        if (GeneralUtility::isAllowedAbsPath($linkSource)) {
            $localFile = $linkSource;
        } else {
            $localFile = GeneralUtility::getFileAbsFileName(PATH_site . $linkSource);
        }
        if (!$localFile || !@is_file($localFile)) {
            continue;
        }
        // Index local file:
        if (is_object($crawler)) {
            $params = ['document' => $linkSource];
            if ($localPath) {
                // The URL under which the document is reachable differs from the file path.
                $params['alturl'] = $linkInfo['href'];
            }
            $params['conf'] = $this->conf;
            // The page content is not needed for indexing the file:
            unset($params['conf']['content']);
            $crawler->addQueueEntry_callBack(0, $params, Hook\CrawlerFilesHook::class, $this->conf['id']);
            $this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
        } elseif ($localPath) {
            $fI = pathinfo($linkSource);
            $ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
            $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
        } else {
            $this->indexRegularDocument($linkSource);
        }
    }
}
819 
/**
 * Extracts all <a> tags that have a href which does not start with "#".
 *
 * @param string $html HTML content to be analyzed
 * @return array List of arrays with the keys 'tag' (the tag's source), 'href' and 'localPath' (result of createLocalPath() for the href)
 */
public function extractHyperLinks($html)
{
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $segments = $htmlParser->splitTags('a', $html);
    $links = [];
    foreach ($segments as $position => $segment) {
        // Only the odd positions of the split result are inspected as tags.
        if ($position % 2 === 0) {
            continue;
        }
        $attributes = $htmlParser->get_tag_attributes($segment, true);
        $tagName = $htmlParser->getFirstTagName($segment);
        if (strtolower($tagName) !== 'a') {
            continue;
        }
        $href = $attributes[0]['href'];
        // Skip tags without href and in-page anchors.
        if (!$href || $href[0] == '#') {
            continue;
        }
        $links[] = [
            'tag' => $segment,
            'href' => $href,
            'localPath' => $this->createLocalPath($href)
        ];
    }
    return $links;
}
849 
/**
 * Returns the href of the first <base> tag that has a non-empty href.
 *
 * @param string $html HTML content to be analyzed
 * @return string The href value; empty if no <base> tag with a href was found
 */
public function extractBaseHref($html)
{
    $baseHref = '';
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $segments = $htmlParser->splitTags('base', $html);
    foreach ($segments as $position => $segment) {
        // Only the odd positions of the split result are inspected as tags.
        if ($position % 2 === 0) {
            continue;
        }
        $attributes = $htmlParser->get_tag_attributes($segment, true);
        $tagName = $htmlParser->getFirstTagName($segment);
        if (strtolower($tagName) !== 'base') {
            continue;
        }
        $baseHref = $attributes[0]['href'];
        if ($baseHref) {
            // The first usable href wins.
            break;
        }
    }
    return $baseHref;
}
875 
876  /******************************************
877  *
878  * Indexing; external URL
879  *
880  ******************************************/
/**
 * Fetches an external URL and indexes it as an HTML document, provided the
 * response headers report a "text/html" content type.
 *
 * The fetched content is also kept in $this->indexExternalUrl_content.
 *
 * @param string $externalUrl URL, eg. "http://typo3.org/
 * @return void
 */
public function indexExternalUrl($externalUrl)
{
    // Get headers:
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    // HTTP header names are case-insensitive, so do not rely on the exact spelling "Content-Type".
    $contentType = '';
    if (is_array($urlHeaders)) {
        foreach ($urlHeaders as $headerName => $headerValue) {
            if (strcasecmp(trim((string)$headerName), 'Content-Type') === 0) {
                $contentType = (string)$headerValue;
            }
        }
    }
    if (stripos($contentType, 'text/html') === false) {
        return;
    }
    $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
    if ((string)$content === '') {
        return;
    }
    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    if (!$tmpFile) {
        return;
    }
    try {
        GeneralUtility::writeFile($tmpFile, $content);
        // Index that file. Using "TRUE" for second parameter to force indexing of
        // external URLs (mtime doesn't make sense, does it?)
        $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
    } finally {
        // Remove the temporary file even if indexing failed.
        unlink($tmpFile);
    }
}
911 
/**
 * Fetches the response headers of a URL.
 *
 * Each header line is split at the first colon; the part before it becomes the
 * array key, the part after it (not trimmed) the value. Lines without a colon,
 * such as the HTTP status line, are stored with a NULL value.
 *
 * @param string $url The URL
 * @return array|null Header array, or NULL if nothing could be fetched
 */
public function getUrlHeaders($url)
{
    // Try to get the headers only
    $content = GeneralUtility::getUrl($url, 2);
    if ((string)$content === '') {
        return null;
    }
    // Compile headers:
    $retVal = [];
    foreach (GeneralUtility::trimExplode(LF, $content, true) as $line) {
        if (trim($line) === '') {
            break;
        }
        $lineParts = explode(':', $line, 2);
        $retVal[$lineParts[0]] = isset($lineParts[1]) ? $lineParts[1] : null;
    }
    return $retVal;
}
936 
/**
 * Tries to resolve a link target to a local file path.
 *
 * The resolver methods are tried in a fixed order; the first non-empty result wins.
 *
 * @param string $sourcePath Link target (href)
 * @return string Local file path, or the (empty) result of the last resolver if none matched
 */
protected function createLocalPath($sourcePath)
{
    static $resolverMethods = [
        'createLocalPathFromT3vars',
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
        'createLocalPathFromRelativeURL'
    ];
    $resolvedPath = '';
    foreach ($resolverMethods as $resolverMethod) {
        $resolvedPath = $this->{$resolverMethod}($sourcePath);
        if ($resolvedPath != '') {
            return $resolvedPath;
        }
    }
    return $resolvedPath;
}
961 
/**
 * Looks up the local path registered for a link in
 * $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'].
 *
 * The registry is keyed by the short MD5 of the link target.
 *
 * @param string $sourcePath Link target (href)
 * @return string Registered path if it is an existing file, otherwise ''
 */
protected function createLocalPathFromT3vars($sourcePath)
{
    $registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
    if (!is_array($registeredFiles)) {
        return '';
    }
    $lookupKey = GeneralUtility::shortMD5($sourcePath);
    // Note: not using self::isAllowedLocalFile here because this method
    // is allowed to index files outside of the web site (for example,
    // protected downloads)
    $isRegisteredFile = isset($registeredFiles[$lookupKey]) && is_file($registeredFiles[$lookupKey]);
    return $isRegisteredFile ? $registeredFiles[$lookupKey] : '';
}
985 
/**
 * Resolves a link that starts with the site URL (TYPO3_SITE_URL) to a file below PATH_site.
 *
 * @param string $sourcePath Link target (href)
 * @return string Local file path if the link starts with the site URL and points to an allowed file, otherwise ''
 */
protected function createLocalPathUsingDomainURL($sourcePath)
{
    $siteUrl = GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
    $siteUrlLength = strlen($siteUrl);
    if (substr($sourcePath, 0, $siteUrlLength) != $siteUrl) {
        return '';
    }
    // Replace the site URL prefix with the site's file system path.
    $candidate = PATH_site . substr($sourcePath, $siteUrlLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1006 
/**
 * Resolves a link that starts with the configured config.absRefPrefix to a file below PATH_site.
 *
 * Only works when $GLOBALS['TSFE'] is a TypoScriptFrontendController.
 *
 * @param string $sourcePath Link target (href)
 * @return string Local file path if the link starts with a non-empty absRefPrefix and points to an allowed file, otherwise ''
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath)
{
    if (!($GLOBALS['TSFE'] instanceof \TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController)) {
        return '';
    }
    $prefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
    $prefixLength = strlen($prefix);
    if ($prefixLength <= 0 || substr($sourcePath, 0, $prefixLength) != $prefix) {
        return '';
    }
    // Replace the prefix with the site's file system path.
    $candidate = PATH_site . substr($sourcePath, $prefixLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1030 
/**
 * Resolves a link that starts with "/" to a file below PATH_site.
 *
 * @param string $sourcePath Link target (href)
 * @return string Local file path if the link starts with "/" and points to an allowed file, otherwise ''
 */
protected function createLocalPathFromAbsoluteURL($sourcePath)
{
    if ($sourcePath[0] != '/') {
        return '';
    }
    // Drop the leading slash and prepend the site's file system path.
    $candidate = PATH_site . substr($sourcePath, 1);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1050 
/**
 * Resolves a relative link (see isRelativeURL()) to a file below PATH_site.
 *
 * @param string $sourcePath Link target (href)
 * @return string Local file path if the link is relative and points to an allowed file, otherwise ''
 */
protected function createLocalPathFromRelativeURL($sourcePath)
{
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }
    $candidate = PATH_site . $sourcePath;
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1068 
/**
 * Checks whether a URL is relative: it has no scheme and its path does not start with "/".
 *
 * URLs that parse_url() cannot parse are reported as relative as well.
 *
 * @param string $url URL to check
 * @return bool TRUE if the URL is relative
 */
protected static function isRelativeURL($url)
{
    $urlParts = parse_url($url);
    if (!is_array($urlParts)) {
        // parse_url() returns FALSE for seriously malformed URLs.
        $urlParts = [];
    }
    // Relative URLs normally have no "scheme" key at all, so the keys must not be read unguarded.
    $scheme = isset($urlParts['scheme']) ? (string)$urlParts['scheme'] : '';
    $path = isset($urlParts['path']) ? (string)$urlParts['path'] : '';
    return $scheme === '' && substr($path, 0, 1) !== '/';
}
1080 
/**
 * Checks whether a path, after resolving back paths, lies below PATH_site and is an existing file.
 *
 * @param string $filePath Path to check
 * @return bool TRUE if the resolved path starts with PATH_site and is a file
 */
protected static function isAllowedLocalFile($filePath)
{
    // Resolve the back paths first so they cannot be used to leave the site path unnoticed.
    $resolvedPath = GeneralUtility::resolveBackPath($filePath);
    $sitePathLength = strlen(PATH_site);
    $startsWithSitePath = substr($resolvedPath, 0, $sitePathLength) == PATH_site;
    $fileExists = is_file($resolvedPath);
    return $startsWithSitePath && $fileExists;
}
1094 
1095  /******************************************
1096  *
1097  * Indexing; external files (PDF, DOC, etc)
1098  *
1099  ******************************************/
/**
 * Indexes a regular document (PDF, DOC, ...) through the external parser
 * registered for its file extension.
 *
 * For every content part reported by the parser, the stored mtime is checked
 * and the part is (re-)indexed when needed or when $force is set. A section
 * record is ensured for every part, whether or not it was re-indexed.
 *
 * @param string $file File name, absolute or relative to PATH_site
 * @param bool $force If set, indexing happens regardless of mtime, content hash and the external file counter
 * @param string $contentTmpFile If set, this file is read instead of $file (and the allowed-path check is skipped)
 * @param string $altExtension If set, used instead of the extension of $file to select the parser
 * @return void
 */
public function indexRegularDocument($file, $force = false, $contentTmpFile = '', $altExtension = '')
{
    // Init
    $fI = pathinfo($file);
    // pathinfo() omits the 'extension' key for files without an extension.
    $ext = $altExtension ?: (isset($fI['extension']) ? strtolower($fI['extension']) : '');
    // Create abs-path:
    if (!$contentTmpFile) {
        if (!GeneralUtility::isAbsPath($file)) {
            // Relative, prepend PATH_site:
            $absFile = GeneralUtility::getFileAbsFileName(PATH_site . $file);
        } else {
            // Absolute, pass-through:
            $absFile = $file;
        }
        $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
    } else {
        $absFile = $contentTmpFile;
    }
    if (!$absFile || !@is_file($absFile)) {
        $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
        return;
    }
    if (empty($this->external_parsers[$ext])) {
        $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
        return;
    }
    // Indexing the document:
    $fileInfo = stat($absFile);
    $cParts = $this->fileContentParts($ext, $absFile);
    foreach ($cParts as $cPKey) {
        $this->internal_log = [];
        $this->log_push('Index: ' . str_replace('.', '_', basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
        $Pstart = GeneralUtility::milliseconds();
        $subinfo = ['key' => $cPKey];
        // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
        $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
        $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
        if ($check > 0 || $force) {
            if ($check > 0) {
                $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
            } else {
                $this->log_setTSlogMessage('Indexing forced by flag', 1);
            }
            // Check external file counter:
            if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                // Divide into title,keywords,description and body:
                $this->log_push('Split content', '');
                $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                $this->log_pull();
                if (is_array($contentParts)) {
                    // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                    // Without this assignment the duplicate check and the stored contentHash would use an undefined value.
                    $content_md5h = IndexedSearchUtility::md5inthash(implode('', $contentParts));
                    if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                        // Increment counter:
                        $this->externalFileCounter++;
                        // Splitting words
                        $this->log_push('Extract words from content', '');
                        $splitInWords = $this->processWordsInArrays($contentParts);
                        $this->log_pull();
                        // Analyse the indexed words.
                        $this->log_push('Analyse the extracted words', '');
                        $indexArr = $this->indexAnalyze($splitInWords);
                        $this->log_pull();
                        // Submitting page (phash) record
                        $this->log_push('Submitting page', '');
                        // The creation time of a file cannot be determined reliably, so stat()'s ctime is passed as creation date.
                        $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
                        $this->log_pull();
                        // Check words and submit to word list if not there
                        $this->log_push('Check word list and submit words', '');
                        if (IndexedSearchUtility::isTableUsed('index_words')) {
                            $this->checkWordList($indexArr);
                            $this->submitWords($indexArr, $phash_arr['phash']);
                        }
                        $this->log_pull();
                        // Set parsetime
                        $this->updateParsetime($phash_arr['phash'], GeneralUtility::milliseconds() - $Pstart);
                    } else {
                        // Update the timestamp
                        $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
                        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                    }
                } else {
                    $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                }
            } else {
                $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
            }
        } else {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        }
        // Setting a section-record for the file. This is done also if the file is not indexed.
        // Notice that section records are deleted when the page is indexed.
        $this->submitFile_section($phash_arr['phash']);
        $this->log_pull();
    }
}
1206 
/**
 * Reads the content of an external file by delegating to the parser object
 * registered for the file extension.
 *
 * @param string $fileExtension File extension, used as key into $this->external_parsers
 * @param string $absoluteFileName Absolute file name passed on to the parser
 * @param string $sectionPointer Content part pointer passed on to the parser
 * @return mixed Whatever the parser returns, or NULL if no parser object is registered
 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
{
    // Consult relevant external document parser:
    if (!is_object($this->external_parsers[$fileExtension])) {
        return null;
    }
    return $this->external_parsers[$fileExtension]->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
1225 
/**
 * Asks the parser object registered for the extension which content parts
 * the file is divided into.
 *
 * @param string $ext File extension, used as key into $this->external_parsers
 * @param string $absFile Absolute file name passed on to the parser
 * @return array The parser's list of content parts, or [0] if no parser object is registered
 */
public function fileContentParts($ext, $absFile)
{
    // Consult relevant external document parser:
    if (!is_object($this->external_parsers[$ext])) {
        return [0];
    }
    return $this->external_parsers[$ext]->fileContentParts($ext, $absFile);
}
1242 
/**
 * Wraps plain content into a content array: a copy of
 * $this->defaultContentArray with the 'body' entry set to $content.
 *
 * @param string $content Content to store as body
 * @return array Content array
 */
public function splitRegularContent($content)
{
    $parts = $this->defaultContentArray;
    $parts['body'] = $content;

    return $parts;
}
1256 
1257  /**********************************
1258  *
1259  * Analysing content, Extracting words
1260  *
1261  **********************************/
/**
 * Converts every non-empty entry of the content array to UTF-8 (unless the
 * charset already is 'utf-8') and decodes entities via $this->csObj.
 *
 * @param array $contentArr Content array, modified in place
 * @param string $charset Charset of the incoming content
 * @return void
 */
public function charsetEntity2utf8(&$contentArr, $charset)
{
    $mustConvert = $charset !== 'utf-8';
    foreach (array_keys($contentArr) as $part) {
        if ((string)$contentArr[$part] === '') {
            continue;
        }
        // Convert charset if necessary
        if ($mustConvert) {
            $contentArr[$part] = $this->csObj->utf8_encode($contentArr[$part], $charset);
        }
        // decode all numeric / html-entities in the string to real characters:
        $contentArr[$part] = $this->csObj->entities_to_utf8($contentArr[$part], true);
    }
}
1282 
/**
 * Splits every entry of the content array into words using the lexer and
 * removes duplicate words from title, keywords and description.
 *
 * @param array $contentArr Content array with string entries
 * @return array Content array where every entry is a list of words
 */
public function processWordsInArrays($contentArr)
{
    // split all parts to words
    foreach (array_keys($contentArr) as $part) {
        $contentArr[$part] = $this->lexerObj->split2Words($contentArr[$part]);
    }
    // For title, keywords, and description we don't want duplicates:
    foreach (['title', 'keywords', 'description'] as $headerPart) {
        $contentArr[$headerPart] = array_unique($contentArr[$headerPart]);
    }

    return $contentArr;
}
1302 
/**
 * Builds a short description from the body content: whitespace runs are
 * collapsed to single spaces and the result is truncated to the configured
 * length ($this->conf['index_descrLgd'], forced into the range 0-255 with
 * default 200).
 *
 * @param array $contentArr Content array; only the 'body' entry is read
 * @return string Description, or an empty string when the effective length is 0
 */
public function bodyDescription($contentArr)
{
    // Initialised up front so a zero length yields '' instead of an undefined variable.
    $bodyDescription = '';
    // Setting description
    $maxL = MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
    if ($maxL) {
        $bodyDescription = preg_replace('/\s+/u', ' ', $contentArr['body']);
        // Shorten the string:
        $bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
    }
    return $bodyDescription;
}
1320 
/**
 * Builds the word index array for a content array that has already been
 * split into words.
 *
 * Title, keywords and description are analysed with bit offsets 7, 6 and 5
 * respectively, followed by the body.
 *
 * @param array $content Content array with word lists for title, keywords, description and body
 * @return array Index array keyed by word
 */
public function indexAnalyze($content)
{
    $indexArr = [];
    $this->analyzeHeaderinfo($indexArr, $content, 'title', 7);
    $this->analyzeHeaderinfo($indexArr, $content, 'keywords', 6);
    $this->analyzeHeaderinfo($indexArr, $content, 'description', 5);
    $this->analyzeBody($indexArr, $content);
    return $indexArr;
}
1337 
/**
 * Registers the words of one header part (title, keywords, description) in
 * the index array, setting the flag bit 2^$offset for each word and
 * increasing its occurrence counter.
 *
 * @param array $retArr Index array keyed by word, modified in place
 * @param array $content Content array with word lists
 * @param string $key Key in $content whose word list is analysed
 * @param int $offset Bit position set in the 'cmp' field of each word
 * @return void
 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset)
{
    foreach ($content[$key] as $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Priority used for flagBitMask feature (see extension configuration).
        // The entry may not have a 'cmp' value yet, so fall back to 0 instead of reading an unset key.
        $currentFlags = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
        $retArr[$val]['cmp'] = $currentFlags | pow(2, $offset);
        // Increase number of occurrences (starting from 0 for new words)
        $currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
        $retArr[$val]['count'] = $currentCount + 1;
        $this->wordcount++;
    }
}
1370 
/**
 * Registers the body words in the index array, recording the position of the
 * first occurrence of each new word and increasing its occurrence counter.
 *
 * @param array $retArr Index array keyed by word, modified in place
 * @param array $content Content array; only the 'body' word list is read
 * @return void
 */
public function analyzeBody(&$retArr, $content)
{
    foreach ($content['body'] as $key => $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // First occurrence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Increase number of occurrences (starting from 0 for new words, instead of incrementing an unset key)
        $currentCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
        $retArr[$val]['count'] = $currentCount + 1;
        $this->wordcount++;
    }
}
1401 
/**
 * Creates the metaphone value for a word.
 *
 * Uses $this->metaphoneObj when it is an object, otherwise PHP's native
 * metaphone() function.
 *
 * @param string $word Word to process
 * @param bool $returnRawMetaphoneValue If set, the raw metaphone value is returned instead of its integer hash
 * @return mixed Raw metaphone value, its integer hash, or 0 when the raw value is an empty string
 */
public function metaphone($word, $returnRawMetaphoneValue = false)
{
    // Fall back to the native PHP function when no metaphone object is available
    $rawValue = is_object($this->metaphoneObj)
        ? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
        : metaphone($word);

    if ($returnRawMetaphoneValue) {
        return $rawValue;
    }
    if ($rawValue !== '') {
        // Create hash and return integer
        return IndexedSearchUtility::md5inthash($rawValue);
    }
    return 0;
}
1427 
1428  /********************************
1429  *
1430  * SQL; TYPO3 Pages
1431  *
1432  *******************************/
/**
 * Stores the currently indexed TYPO3 page: removes old records for the
 * page's phash, then inserts the index_phash row, the section and gr_list
 * records, the fulltext row and (in debug mode) a debug row.
 *
 * Every insert is skipped when IndexedSearchUtility::isTableUsed() reports
 * the table as unused.
 *
 * @return void
 */
public function submitPage()
{
    $phash = $this->hash['phash'];
    // Remove any current data for this phash:
    $this->removeOldIndexedPages($phash);

    // setting new phash_row
    $phashRow = [
        'phash' => $phash,
        'phash_grouping' => $this->hash['phash_grouping'],
        'cHashParams' => serialize($this->cHashParams),
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        'data_page_reg1' => $this->conf['page_cache_reg1'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // item_type 0 = TYPO3 page
        'item_type' => 0,
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => (int)$this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        // Creation date of page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);
    }

    // PROCESSING index_section
    $this->submit_section($this->hash['phash'], $this->hash['phash']);
    // PROCESSING index_grlist
    $this->submit_grlist($this->hash['phash'], $this->hash['phash']);

    // PROCESSING index_fulltext
    $fulltextRow = [
        'phash' => $this->hash['phash'],
        'fulltextdata' => implode(' ', $this->contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    $maxFulltextLength = $this->indexerConfig['fullTextDataLength'];
    if ($maxFulltextLength > 0) {
        $fulltextRow['fulltextdata'] = substr($fulltextRow['fulltextdata'], 0, $maxFulltextLength);
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRow);
    }

    // PROCESSING index_debug
    if (!$this->indexerConfig['debugMode']) {
        return;
    }
    // Page content and body are cut to 1000 bytes for the debug record
    $debugInfo = [
        'cHashParams' => $this->cHashParams,
        'external_parsers initialized' => array_keys($this->external_parsers),
        'conf' => array_merge($this->conf, ['content' => substr($this->conf['content'], 0, 1000)]),
        'contentParts' => array_merge($this->contentParts, ['body' => substr($this->contentParts['body'], 0, 1000)]),
        'logs' => $this->internal_log,
        'lexer' => $this->lexerObj->debugString
    ];
    $debugRow = [
        'phash' => $this->hash['phash'],
        'debuginfo' => serialize($debugInfo)
    ];
    if (IndexedSearchUtility::isTableUsed('index_debug')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
    }
}
1507 
/**
 * Inserts an index_grlist record for the given hashes using the gr_list from
 * $this->conf (skipped when the table is not in use).
 *
 * @param int $hash Value stored in the phash column
 * @param int $phash_x Value stored in the phash_x column
 * @return void
 */
public function submit_grlist($hash, $phash_x)
{
    $groupList = $this->conf['gr_list'];
    // Setting the gr_list record
    $record = [
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($groupList),
        'gr_list' => $groupList
    ];
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', $record);
}
1529 
/**
 * Inserts an index_section record for the given hashes, the current page id
 * and the rootline fields (skipped when the table is not in use).
 *
 * @param int $hash Value stored in the phash column
 * @param int $hash_t3 Value stored in the phash_t3 column
 * @return void
 */
public function submit_section($hash, $hash_t3)
{
    $record = [
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id']
    ];
    // Adds rl0, rl1, rl2 and any additionally configured rootline fields
    $this->getRootLineFields($record);
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $record);
}
1550 
/**
 * Deletes all index records of a TYPO3 page identified by its phash,
 * including index_section records whose phash_t3 points to it.
 *
 * @param int $phash phash value of the page
 * @return void
 */
public function removeOldIndexedPages($phash)
{
    $phashCondition = 'phash=' . (int)$phash;
    // Removing old registrations for all tables. Because the pages are TYPO3 pages
    // there can be nothing else than 1-1 relations here.
    foreach (explode(',', 'index_phash,index_section,index_grlist,index_fulltext,index_debug') as $tableName) {
        if (!IndexedSearchUtility::isTableUsed($tableName)) {
            continue;
        }
        $GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, $phashCondition);
    }
    // Removing all index_section records with hash_t3 set to this hash (this includes such
    // records set for external media on the page as well!). The re-insert of these records
    // are done in indexRegularDocument($file).
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3=' . (int)$phash);
    }
}
1571 
1572  /********************************
1573  *
1574  * SQL; External media
1575  *
1576  *******************************/
/**
 * Stores an indexed external file: removes old records for its phash, then
 * inserts the index_phash row, the fulltext row and (in debug mode) a debug
 * row.
 *
 * @param array $hash Array with 'phash' and 'phash_grouping' of the file
 * @param string $file File name, stored as data_filename
 * @param array $subinfo Sub information of the file, stored serialized as cHashParams
 * @param string $ext File extension, used to look up the item type
 * @param int $mtime Modification time of the file
 * @param int $ctime Value stored as creation date of the file
 * @param int $size Size of the file
 * @param int $content_md5h Content hash
 * @param array $contentParts Content array of the file
 * @return void
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
{
    // Find item Type: mapping provided by the parser, falling back to the extension itself
    $mappedItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
    $storeItemType = $mappedItemType ? $mappedItemType : $ext;
    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($hash['phash']);
    // Split filename:
    $fileParts = parse_url($file);

    // Setting new
    $phashRow = [
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'cHashParams' => serialize($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $storeItemType,
        'item_title' => trim($contentParts['title']) ?: basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        // A file name carrying a scheme is flagged as external URL
        'externalUrl' => $fileParts['scheme'] ? 1 : 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);
    }

    // PROCESSING index_fulltext
    $fulltextRow = [
        'phash' => $hash['phash'],
        'fulltextdata' => implode(' ', $contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    $maxFulltextLength = $this->indexerConfig['fullTextDataLength'];
    if ($maxFulltextLength > 0) {
        $fulltextRow['fulltextdata'] = substr($fulltextRow['fulltextdata'], 0, $maxFulltextLength);
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRow);
    }

    // PROCESSING index_debug
    if (!$this->indexerConfig['debugMode']) {
        return;
    }
    $debugInfo = [
        'cHashParams' => $subinfo,
        'contentParts' => array_merge($contentParts, ['body' => substr($contentParts['body'], 0, 1000)]),
        'logs' => $this->internal_log,
        'lexer' => $this->lexerObj->debugString
    ];
    $debugRow = [
        'phash' => $hash['phash'],
        'debuginfo' => serialize($debugInfo)
    ];
    if (IndexedSearchUtility::isTableUsed('index_debug')) {
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
    }
}
1653 
/**
 * Inserts a gr_list record for a file unless one already exists for either
 * the default gr_list or the current gr_list from $this->conf.
 *
 * @param int $hash phash value of the file
 * @return void
 */
public function submitFile_grlist($hash)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    // Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
    $where = 'phash=' . (int)$hash
        . ' AND (hash_gr_list=' . IndexedSearchUtility::md5inthash($this->defaultGrList)
        . ' OR hash_gr_list=' . IndexedSearchUtility::md5inthash($this->conf['gr_list']) . ')';
    $existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
    if ($existing == 0) {
        $this->submit_grlist($hash, $hash);
    }
}
1670 
/**
 * Inserts a section record linking a file to the current page unless such a
 * record (same phash and page id) already exists.
 *
 * @param int $hash phash value of the file
 * @return void
 */
public function submitFile_section($hash)
{
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    // Testing if there is already a section
    $where = 'phash=' . (int)$hash . ' AND page_id=' . (int)$this->conf['id'];
    $existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_section', $where);
    if ($existing == 0) {
        $this->submit_section($hash, $this->hash['phash']);
    }
}
1687 
/**
 * Deletes the index_phash, index_grlist, index_fulltext and index_debug
 * records of a file identified by its phash. Section records are not touched.
 *
 * @param int $phash phash value of the file
 * @return void
 */
public function removeOldIndexedFiles($phash)
{
    $phashCondition = 'phash=' . (int)$phash;
    // Removing old registrations for tables.
    foreach (explode(',', 'index_phash,index_grlist,index_fulltext,index_debug') as $tableName) {
        if (!IndexedSearchUtility::isTableUsed($tableName)) {
            continue;
        }
        $GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, $phashCondition);
    }
}
1704 
1705  /********************************
1706  *
1707  * SQL Helper functions
1708  *
1709  *******************************/
/**
 * Decides whether a document needs (re-)indexing based on its modification
 * time and the timestamps stored in index_phash.
 *
 * Result codes (see $this->reasons): positive values mean "index", negative
 * values mean "do not index":
 *   4  no index_phash record (or table not in use)
 *   1  configured max-age exceeded
 *   2  min-age passed and mtime differs
 *   3  min-age passed and no mtime given
 *  -1  mtime matched
 *  -2  min-age not exceeded
 *
 * When the mtime matches and no max-age is configured, the record's tstamp is
 * refreshed as a side effect.
 *
 * @param int $mtime Modification time of the document
 * @param int $phash phash value of the document
 * @return int Result code
 */
public function checkMtimeTstamp($mtime, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        // Not indexed (not in index_phash)
        return 4;
    }
    $row = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('item_mtime,tstamp', 'index_phash', 'phash=' . (int)$phash);
    if (!$row) {
        // Page has never been indexed (is not represented in the index_phash table).
        return 4;
    }
    if ($this->tstamp_maxAge && $row['tstamp'] + $this->tstamp_maxAge < $GLOBALS['EXEC_TIME']) {
        // The configured max-age was exceeded for the document and thus it's indexed.
        return 1;
    }
    if ($this->tstamp_minAge && !($row['tstamp'] + $this->tstamp_minAge < $GLOBALS['EXEC_TIME'])) {
        // The minimum age was not exceeded
        return -2;
    }
    // minAge is not set or exceeded: look at mtime
    if (!$mtime) {
        // The minimum age was exceed, but mtime was not set, so the page was indexed.
        return 3;
    }
    if ($row['item_mtime'] != $mtime) {
        // The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.
        return 2;
    }
    // mtime matched the document, so no changes detected and no content updated
    if ($this->tstamp_maxAge) {
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($row['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . ' seconds to expire time).', 1);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
    }
    return -1;
}
1766 
/**
 * Looks for an index_phash record with the same phash_grouping and content
 * hash as the current page.
 *
 * @return mixed The matching row (array with 'phash') if one exists, otherwise TRUE
 *               (also TRUE when the index_phash table is not in use)
 */
public function checkContentHash()
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }
    // With this query the page will only be indexed if it's content is different from the same "phash_grouping" -page.
    $where = 'phash_grouping=' . (int)$this->hash['phash_grouping'] . ' AND contentHash=' . (int)$this->content_md5h;
    $row = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('phash', 'index_phash', $where);
    return $row ? $row : true;
}
1784 
/**
 * Checks that no index_phash record exists with the given phash_grouping and
 * content hash.
 *
 * @param int $hashGr phash_grouping value
 * @param int $content_md5h Content hash
 * @return bool TRUE if no such record exists (or the table is not in use)
 */
public function checkExternalDocContentHash($hashGr, $content_md5h)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }
    $where = 'phash_grouping=' . (int)$hashGr . ' AND contentHash=' . (int)$content_md5h;
    $matches = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('*', 'index_phash', $where);
    return $matches == 0;
}
1802 
/**
 * Checks whether any index_grlist record exists for the given phash_x.
 *
 * @param int $phash_x phash_x value to look up
 * @return bool TRUE if at least one record exists; FALSE otherwise or when the table is not in use
 */
public function is_grlist_set($phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return false;
    }
    $matches = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash_x', 'index_grlist', 'phash_x=' . (int)$phash_x);
    return $matches > 0;
}
1818 
/**
 * Inserts a gr_list record for the phash and the current gr_list from
 * $this->conf if no such record exists yet, and logs the insertion.
 *
 * @param int $phash phash value the gr_list record belongs to
 * @param int $phash_x phash_x value stored with a newly inserted record
 * @return void
 */
public function update_grlist($phash, $phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    $where = 'phash=' . (int)$phash . ' AND hash_gr_list=' . IndexedSearchUtility::md5inthash($this->conf['gr_list']);
    $existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
    if ($existing != 0) {
        return;
    }
    $this->submit_grlist($phash, $phash_x);
    $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
}
1837 
/**
 * Sets the tstamp of an index_phash record to the current execution time
 * and, when $mtime is given, also updates item_mtime.
 *
 * @param int $phash phash value of the record
 * @param int $mtime New item_mtime; left untouched when 0
 * @return void
 */
public function updateTstamp($phash, $mtime = 0)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }
    $changes = ['tstamp' => $GLOBALS['EXEC_TIME']];
    if ($mtime) {
        $changes['item_mtime'] = (int)$mtime;
    }
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int)$phash, $changes);
}
1857 
/**
 * Writes the freeIndexSetId from $this->conf into the index_phash record.
 *
 * @param int $phash phash value of the record
 * @return void
 */
public function updateSetId($phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }
    $changes = ['freeIndexSetId' => (int)$this->conf['freeIndexSetId']];
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int)$phash, $changes);
}
1873 
/**
 * Stores the parse time in the index_phash record.
 *
 * @param int $phash phash value of the record
 * @param int $parsetime Parse time, cast to integer before storing
 * @return void
 */
public function updateParsetime($phash, $parsetime)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }
    $changes = ['parsetime' => (int)$parsetime];
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int)$phash, $changes);
}
1890 
/**
 * Rewrites the rootline fields of all index_section records that belong to
 * the current page id.
 *
 * @return void
 */
public function updateRootline()
{
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    $rootlineFields = [];
    $this->getRootLineFields($rootlineFields);
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', 'page_id=' . (int)$this->conf['id'], $rootlineFields);
}
1904 
/**
 * Adds the rootline fields rl0, rl1 and rl2 to the field array, taken from
 * $this->conf['rootline_uids'], plus any extra fields configured in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields']
 * (field name => rootline level).
 *
 * @param array $fieldArray Field array, modified in place
 * @return void
 */
public function getRootLineFields(array &$fieldArray)
{
    foreach ([0, 1, 2] as $level) {
        $fieldArray['rl' . $level] = (int)$this->conf['rootline_uids'][$level];
    }
    $extraFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'];
    if (!is_array($extraFields)) {
        return;
    }
    foreach ($extraFields as $fieldName => $rootLineLevel) {
        $fieldArray[$fieldName] = (int)$this->conf['rootline_uids'][$rootLineLevel];
    }
}
1923 
/**
 * Loads class.tx_crawler_lib.php from the "crawler" extension.
 *
 * @return void
 */
public function includeCrawlerClass()
{
    $crawlerExtensionPath = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('crawler');
    GeneralUtility::requireOnce($crawlerExtensionPath . 'class.tx_crawler_lib.php');
}
1933 
1934  /********************************
1935  *
1936  * SQL; Submitting words
1937  *
1938  *******************************/
/**
 * Makes sure every word of the word list has a record in index_words,
 * inserting the ones that are missing.
 *
 * @param array $wordListArray Index array keyed by word; each entry provides 'hash' and 'metaphone'
 * @return void
 */
public function checkWordList($wordListArray)
{
    if (!IndexedSearchUtility::isTableUsed('index_words') || empty($wordListArray)) {
        return;
    }
    $wordIds = [];
    foreach ($wordListArray as $wordData) {
        $wordIds[] = (int)$wordData['hash'];
    }
    $where = 'wid IN (' . implode(',', $wordIds) . ')';
    $knownCount = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('baseword', 'index_words', $where);
    $listCount = count($wordListArray);
    if ($knownCount === $listCount) {
        // Every word is already registered
        return;
    }
    $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', $where);
    $this->log_setTSlogMessage('Inserting words: ' . ($listCount - $knownCount), 1);
    // Drop the words that already exist so only the missing ones remain
    while ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
        unset($wordListArray[$row['baseword']]);
    }
    $GLOBALS['TYPO3_DB']->sql_free_result($res);
    foreach ($wordListArray as $baseword => $wordData) {
        // A duplicate-key error will occur here if a word is NOT unset above. However as long as
        // the words are NOT longer than 60 chars (the baseword varchar is 60 characters...) this
        // is not a problem.
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', [
            'wid' => $wordData['hash'],
            'baseword' => $baseword,
            'metaphone' => $wordData['metaphone']
        ]);
    }
}
1976 
/**
 * Replaces the index_rel records of a phash with one record per word of the
 * word list, leaving out words flagged as stop words in index_words.
 *
 * @param array $wordList Index array; each entry provides 'hash', 'count', 'first' and 'cmp'
 * @param int $phash phash value the words belong to
 * @return void
 */
public function submitWords($wordList, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_rel')) {
        return;
    }
    // Stop words, indexed by wid
    $stopWords = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('wid', 'index_words', 'is_stopword != 0', '', '', '', 'wid');

    $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash=' . (int)$phash);
    $columns = ['phash', 'wid', 'count', 'first', 'freq', 'flags'];
    $relationRows = [];
    foreach ($wordList as $wordData) {
        if (!isset($stopWords[$wordData['hash']])) {
            $relationRows[] = [
                (int)$phash,
                (int)$wordData['hash'],
                (int)$wordData['count'],
                (int)$wordData['first'],
                $this->freqMap($wordData['count'] / $this->wordcount),
                $wordData['cmp'] & $this->flagBitMask
            ];
        }
    }
    $GLOBALS['TYPO3_DB']->exec_INSERTmultipleRows('index_rel', $columns, $relationRows);
}
2008 
/**
 * Maps a word frequency onto the range defined by $this->freqRange.
 *
 * Frequencies up to 1 are multiplied by freqMax * 100 * freqRange and capped
 * at freqRange; frequencies above 1 are divided by that factor.
 *
 * @param float $freq Frequency
 * @return float|int Mapped frequency
 */
public function freqMap($freq)
{
    $mapFactor = $this->freqMax * 100 * $this->freqRange;
    if ($freq <= 1) {
        $scaled = $freq * $mapFactor;
        if ($scaled > $this->freqRange) {
            return $this->freqRange;
        }
        return $scaled;
    }
    return $freq / $mapFactor;
}
2027 
2028  /********************************
2029  *
2030  * Hashing
2031  *
2032  *******************************/
/**
 * Calculates the hashes of the current TYPO3 page and stores them in
 * $this->hash: 'phash_grouping' over id, type, language, mount point and
 * cHash parameters, and 'phash' over the same data plus the gr_list.
 *
 * @return void
 */
public function setT3Hashes()
{
    // Data identifying a "page" combined of id, type, language, mountpoint and cHash parameters
    $pageIdentity = [
        'id' => (int)$this->conf['id'],
        'type' => (int)$this->conf['type'],
        'sys_lang' => (int)$this->conf['sys_language_uid'],
        'MP' => (string)$this->conf['MP'],
        'cHash' => $this->cHashParams
    ];
    $this->hash['phash_grouping'] = IndexedSearchUtility::md5inthash(serialize($pageIdentity));

    // Plain phash: subdivision where special page composition based on login is taken into
    // account as well. It is expected that such pages are normally similar regardless of the login.
    $pageIdentity['gr_list'] = (string)$this->conf['gr_list'];
    $this->hash['phash'] = IndexedSearchUtility::md5inthash(serialize($pageIdentity));
}
2054 
/**
 * Calculates the hashes of an external file: 'phash_grouping' over the file
 * name only and 'phash' over the file name plus the sub information.
 *
 * @param string $file File name
 * @param array $subinfo Sub information of the file
 * @return array Array with the keys 'phash_grouping' and 'phash'
 */
public function setExtHashes($file, $subinfo = [])
{
    $fileIdentity = ['file' => $file];
    // Grouping hash: file name only
    $groupingHash = IndexedSearchUtility::md5inthash(serialize($fileIdentity));
    // phash: file name plus subinfo
    $fileIdentity['subinfo'] = $subinfo;
    $plainHash = IndexedSearchUtility::md5inthash(serialize($fileIdentity));

    return [
        'phash_grouping' => $groupingHash,
        'phash' => $plainHash
    ];
}
2076 
2077  /*********************************
2078  *
2079  * Internal logging functions
2080  *
2081  *********************************/
/**
 * Forwards a push() call to $GLOBALS['TT'] when that is an object.
 *
 * @param string $msg Message passed to push()
 * @param string $key Key passed to push()
 * @return void
 */
public function log_push($msg, $key)
{
    if (!is_object($GLOBALS['TT'])) {
        return;
    }
    $GLOBALS['TT']->push($msg, $key);
}
2095 
/**
 * Forwards a pull() call to $GLOBALS['TT'] when that is an object.
 *
 * @return void
 */
public function log_pull()
{
    if (!is_object($GLOBALS['TT'])) {
        return;
    }
    $GLOBALS['TT']->pull();
}
2107 
/**
 * Records a log message: forwards it to $GLOBALS['TT'] when that is an
 * object and always appends it to $this->internal_log.
 *
 * @param string $msg Message
 * @param int $errorNum Number passed on as second argument to setTSlogMessage()
 * @return void
 */
public function log_setTSlogMessage($msg, $errorNum = 0)
{
    $timeTracker = $GLOBALS['TT'];
    if (is_object($timeTracker)) {
        $timeTracker->setTSlogMessage($msg, $errorNum);
    }
    $this->internal_log[] = $msg;
}
2122 
/**
 * Normalises a comma-separated keyword list: every keyword is trimmed, the
 * keywords are re-joined with ", " and the result is wrapped in one leading
 * and one trailing space.
 *
 * @param string $keywordList Comma-separated keywords
 * @return string Keyword list with spaces added
 */
protected function addSpacesToKeywordList($keywordList)
{
    $trimmedKeywords = GeneralUtility::trimExplode(',', $keywordList);
    $joined = implode(', ', $trimmedKeywords);

    return ' ' . $joined . ' ';
}
2136 }
analyzeHeaderinfo(&$retArr, $content, $key, $offset)
Definition: Indexer.php:1347
submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
Definition: Indexer.php:1591
submit_grlist($hash, $phash_x)
Definition: Indexer.php:1516
setExtHashes($file, $subinfo=[])
Definition: Indexer.php:2062
indexRegularDocument($file, $force=false, $contentTmpFile='', $altExtension='')
Definition: Indexer.php:1109
backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=[], $createCHash=false)
Definition: Indexer.php:338
backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0)
Definition: Indexer.php:407
static isAllowedLocalFile($filePath)
Definition: Indexer.php:1087
checkExternalDocContentHash($hashGr, $content_md5h)
Definition: Indexer.php:1793
convertHTMLToUtf8($content, $charset='')
Definition: Indexer.php:666
static forceIntegerInRange($theInt, $min, $max=2000000000, $defaultValue=0)
Definition: MathUtility.php:31
update_grlist($phash, $phash_x)
Definition: Indexer.php:1827
embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
Definition: Indexer.php:691
createLocalPathFromRelativeURL($sourcePath)
Definition: Indexer.php:1057
createLocalPathFromT3vars($sourcePath)
Definition: Indexer.php:970
updateTstamp($phash, $mtime=0)
Definition: Indexer.php:1845
getRootLineFields(array &$fieldArray)
Definition: Indexer.php:1912
static trimExplode($delim, $string, $removeEmptyValues=false, $limit=0)
metaphone($word, $returnRawMetaphoneValue=false)
Definition: Indexer.php:1409
addSpacesToKeywordList($keywordList)
Definition: Indexer.php:2131
submitWords($wordList, $phash)
Definition: Indexer.php:1984
fileContentParts($ext, $absFile)
Definition: Indexer.php:1233
updateParsetime($phash, $parsetime)
Definition: Indexer.php:1881
readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
Definition: Indexer.php:1216
static implodeArrayForUrl($name, array $theArray, $str='', $skipBlank=false, $rawurlencodeParamName=false)
static tempnam($filePrefix, $fileSuffix='')
analyzeBody(&$retArr, $content)
Definition: Indexer.php:1378
createLocalPathFromAbsoluteURL($sourcePath)
Definition: Indexer.php:1038
createLocalPathUsingDomainURL($sourcePath)
Definition: Indexer.php:992
static getUrl($url, $includeHeader=0, $requestHeaders=false, &$report=null)
static getFileAbsFileName($filename, $onlyRelative=true, $relToTYPO3_mainDir=false)
charsetEntity2utf8(&$contentArr, $charset)
Definition: Indexer.php:1269
backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0)
Definition: Indexer.php:388
if(TYPO3_MODE==='BE') $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_tsfebeuserauth.php']['frontendEditingController']['default']
static writeFile($file, $content, $changePermissions=false)
checkMtimeTstamp($mtime, $phash)
Definition: Indexer.php:1718
submit_section($hash, $hash_t3)
Definition: Indexer.php:1538
log_setTSlogMessage($msg, $errorNum=0)
Definition: Indexer.php:2115
indexExternalUrl($externalUrl)
Definition: Indexer.php:888
checkWordList($wordListArray)
Definition: Indexer.php:1945
createLocalPathUsingAbsRefPrefix($sourcePath)
Definition: Indexer.php:1014