‪TYPO3CMS  9.5
Indexer.php
Go to the documentation of this file.
1 <?php
3 
4 /*
5  * This file is part of the TYPO3 CMS project.
6  *
7  * It is free software; you can redistribute it and/or modify it under
8  * the terms of the GNU General Public License, either version 2
9  * of the License, or any later version.
10  *
11  * For the full copyright and license information, please read the
12  * LICENSE.txt file that was distributed with this source code.
13  *
14  * The TYPO3 project - inspiring people to share!
15  */
16 
17 use Psr\Http\Message\ServerRequestInterface;
33 
38 {
40 
/**
 * Maps the name of a deprecated public property to its deprecation message.
 * NOTE(review): presumably consumed by a public-property deprecation trait
 * declared outside this view - confirm.
 *
 * @var string[]
 */
protected $deprecatedPublicProperties = [
    'csObj' => 'Using $csObj within Indexing is discouraged, the property will be removed in TYPO3 v10.0 - if needed instantiate CharsetConverter yourself.',
];

/**
 * Human readable explanations, keyed by the result code of checkMtimeTstamp().
 * Used for the log messages written while indexing.
 *
 * @var string[]
 */
public $reasons = [
    -1 => 'mtime matched the document, so no changes detected and no content updated',
    -2 => 'The minimum age was not exceeded',
    1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
    2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
    3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
    4 => 'Page has never been indexed (is not represented in the index_phash table).',
];

/**
 * Comma separated list of HTML tags whose sections are stripped from the
 * body before indexing (see splitHTMLContent()).
 *
 * @var string
 */
public $excludeSections = 'script,style';

/**
 * External parser objects, keyed by the key they were registered with
 * (filled by initializeExternalParsers(), looked up by file extension).
 *
 * @var object[]
 */
public $external_parsers = [];

/**
 * Default frontend group list.
 *
 * @var string
 */
public $defaultGrList = '0,-1';

/**
 * Maximum age in seconds, derived in init() from the "maxAge" extension
 * setting (configured in hours).
 *
 * @var int
 */
public $tstamp_maxAge = 0;

/**
 * Minimum age in seconds, derived in init() from the "minAge" extension
 * setting (configured in hours).
 *
 * @var int
 */
public $tstamp_minAge = 0;

/**
 * Value of the "maxExternalFiles" extension setting, forced into 0..1000 in init().
 *
 * @var int
 */
public $maxExternalFiles = 0;

/**
 * If TRUE, content is indexed regardless of the mtime/age checks.
 *
 * @var bool
 */
public $forceIndexing = false;

/**
 * Set by hook_indexContent() when a crawler session requested re-indexing.
 *
 * @var bool
 */
public $crawlerActive = false;

/**
 * Template of the array returned by splitHTMLContent().
 *
 * @var string[]
 */
public $defaultContentArray = [
    'title' => '',
    'description' => '',
    'keywords' => '',
    'body' => '',
];

/**
 * @var int
 */
public $wordcount = 0;

/**
 * @var int
 */
public $externalFileCounter = 0;

/**
 * Configuration of the current indexing run (page id, type, language,
 * content, mtime, ...), set by hook_indexContent() or the backend_* methods.
 *
 * @var array
 */
public $conf = [];

/**
 * Extension configuration of "indexed_search", loaded in init().
 *
 * @var array
 */
public $indexerConfig = [];

/**
 * Hashes identifying the indexed page; the key "phash" is read during indexing.
 *
 * @var array
 */
public $hash = [];

/**
 * Hashes of the external file currently being indexed.
 *
 * @var array
 */
public $file_phash_arr = [];

/**
 * Result of splitHTMLContent() for the current page (title, description,
 * keywords, body).
 *
 * @var array
 */
public $contentParts = [];

/**
 * Hash calculated over the concatenated content parts.
 *
 * @var string
 */
public $content_md5h = '';

/**
 * @var array
 */
public $internal_log = [];

/**
 * Content last fetched by indexExternalUrl().
 *
 * @var string
 */
public $indexExternalUrl_content = '';

/**
 * cHash parameters of the current page, set in init().
 *
 * @var array
 */
public $cHashParams = [];

/**
 * @var int
 */
public $freqRange = 32000;

/**
 * @var float
 */
public $freqMax = 0.1;

/**
 * Whether metaphone search is enabled, set in init() from the extension configuration.
 *
 * @var bool
 */
public $enableMetaphoneSearch = false;

/**
 * Set in init(): TRUE when the table "index_words" is not in use while
 * metaphone search is enabled. Declared here because init() assigns it;
 * without the declaration it would be created as a dynamic property.
 *
 * @var bool
 */
public $storeMetaphoneInfoAsWords;

/**
 * @var string
 */
public $metaphoneContent = '';

/**
 * Charset converter, instantiated in init().
 *
 * @var \TYPO3\CMS\Core\Charset\CharsetConverter
 */
protected $csObj;

/**
 * Metaphone hook object, instantiated in init() when configured.
 *
 * @var object
 */
public $metaphoneObj;

/**
 * Lexer object that splits text into words, instantiated in init().
 *
 * @var object
 */
public $lexerObj;

/**
 * Value of the "flagBitMask" extension setting, forced into 0..255 in init().
 *
 * @var int
 */
public $flagBitMask;

/**
 * @var TimeTracker
 */
protected $timeTracker;
259 
/**
 * Creates the indexer and stores a TimeTracker obtained through
 * GeneralUtility::makeInstance().
 */
public function __construct()
{
    $tracker = GeneralUtility::makeInstance(TimeTracker::class);
    $this->timeTracker = $tracker;
}
267 
/**
 * Frontend hook: decides whether the page that is currently rendered is to be
 * indexed and, if so, copies the relevant state of the frontend controller
 * into $this->conf, initializes the indexer and indexes the page content.
 *
 * Nothing happens unless "config.index_enable" is set. Indexing is skipped
 * (with a log message) when frontend indexing is disabled and no crawler
 * session forces it, when the page has "no_search" set, when the page is
 * "no_cache", or when the language id differs from the content language id.
 *
 * @param object $pObj Frontend controller object (passed by reference by the hook).
 *                     NOTE(review): presumably TypoScriptFrontendController - confirm against caller.
 */
public function hook_indexContent(&$pObj)
{
    // Indexer configuration from Extension Manager interface:
    $disableFrontendIndexing = (bool)GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search', 'disableFrontendIndexing');

    // Crawler activation: the crawler must be loaded, a crawler session must be
    // running and re-indexing must be requested as processing instruction.
    $crawlerData = $pObj->applicationData['tx_crawler'] ?? [];
    $reindexRequested = in_array(
        'tx_indexedsearch_reindex',
        (array)($crawlerData['parameters']['procInstructions'] ?? []),
        true
    );
    if (
        \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')
        && !empty($crawlerData['running'])
        && $reindexRequested
    ) {
        $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
        $this->crawlerActive = true;
        $this->forceIndexing = true;
    }

    // Indexing has to be enabled for the page at all:
    if (empty($pObj->config['config']['index_enable'])) {
        return;
    }

    $this->log_push('Index page', '');
    if ($disableFrontendIndexing && !$this->crawlerActive) {
        $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
    } elseif (!empty($pObj->page['no_search'])) {
        $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
    } elseif ($pObj->no_cache) {
        $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
    } else {
        $languageAspect = GeneralUtility::makeInstance(Context::class)->getAspect('language');
        if ($languageAspect->getId() === $languageAspect->getContentId()) {
            // Static route arguments, if the current request carries page arguments:
            $staticPageArguments = [];
            if (($GLOBALS['TYPO3_REQUEST'] ?? null) instanceof ServerRequestInterface) {
                $pageArguments = $GLOBALS['TYPO3_REQUEST']->getAttribute('routing', null);
                if ($pageArguments instanceof PageArguments) {
                    $staticPageArguments = $pageArguments->getStaticArguments();
                }
            }

            // Root line uids:
            $rootlineUids = [];
            foreach ($pObj->config['rootLine'] as $rlkey => $rldat) {
                $rootlineUids[$rlkey] = $rldat['uid'];
            }

            // Internal configuration for this indexing run (key order is kept
            // stable because the array is handed on to other components):
            $this->conf = [
                // Page id
                'id' => $pObj->id,
                // Page type
                'type' => $pObj->type,
                // sys_language UID of the language of the indexing
                'sys_language_uid' => $languageAspect->getId(),
                // MP variable, if any (Mount Points)
                'MP' => $pObj->MP,
                // Group list
                'gr_list' => implode(',', GeneralUtility::makeInstance(Context::class)->getPropertyFromAspect('frontend.user', 'groupIds', [0, -1])),
                // cHash string for additional parameters
                'cHash' => $pObj->cHash,
                // cHash array with additional parameters
                'cHash_array' => $pObj->cHash_array,
                // Static page arguments
                'staticPageArguments' => $staticPageArguments,
                // The creation date of the TYPO3 page
                'crdate' => $pObj->page['crdate'],
                // reg1 of the caching table. Not known what practical use this has.
                // @deprecated since TYPO3 v9, will be removed in TYPO3 v10.0. Remove along with database field data_page_reg1
                'page_cache_reg1' => $pObj->page_cache_reg1,
                'rootline_uids' => $rootlineUids,
                // Content string (HTML of TYPO3 page)
                'content' => $pObj->content,
                // Alternative title for indexing
                'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),
                // Character set of content (will be converted to utf-8 during indexing)
                'metaCharset' => $pObj->metaCharset,
                // Most recent modification time (seconds) of the content on the page.
                // Used to evaluate whether it should be re-indexed.
                'mtime' => $pObj->register['SYS_LASTCHANGED'] ?? $pObj->page['SYS_LASTCHANGED'],
                // Whether to index external documents like PDF, DOC etc. (if possible)
                'index_externals' => $pObj->config['config']['index_externals'] ?? null,
                // Length of description text (max 250, default 200)
                'index_descrLgd' => $pObj->config['config']['index_descrLgd'] ?? null,
                // Whether to index document keywords and description (if present)
                'index_metatags' => $pObj->config['config']['index_metatags'] ?? true,
                // Set to zero:
                'recordUid' => 0,
                'freeIndexUid' => 0,
                'freeIndexSetId' => 0,
            ];

            // Init and start indexing:
            $this->init();
            $this->indexTypo3PageContent();
        } else {
            $this->log_setTSlogMessage('Index page? No, languageId was different from contentId which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
        }
    }
    $this->log_pull();
}
372 
373  /****************************
374  *
375  * Backend API
376  *
377  ****************************/
/**
 * Backend API: sets up $this->conf for indexing a page outside of frontend
 * rendering and initializes the indexer via init().
 *
 * @param int $id Page id
 * @param int $type Page type
 * @param int $sys_language_uid sys_language uid of the language of the indexing
 * @param string $MP MP variable, if any (Mount Points)
 * @param array $uidRL Root line uids
 * @param array $cHash_array Additional parameters; "id" is added when a cHash is created
 * @param bool $createCHash If TRUE, a cHash is calculated from $cHash_array, otherwise it is left empty
 */
public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array = [], $createCHash = false)
{
    // Information about the page for which the indexing takes place.
    // The group list is hardcoded for now.
    $this->conf = [
        'id' => $id,
        'type' => $type,
        'sys_language_uid' => $sys_language_uid,
        'MP' => $MP,
        'gr_list' => '0,-1',
    ];

    // cHash string for the additional parameters:
    $cHash = '';
    if ($createCHash) {
        $cHash_array['id'] = $id;
        /* @var \TYPO3\CMS\Frontend\Page\CacheHashCalculator $cacheHash */
        $cacheHash = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\CacheHashCalculator::class);
        $queryString = HttpUtility::buildQueryString($cHash_array);
        $cHash = $cacheHash->generateForParameters($queryString);
    }

    // None of the keys below exists yet, so the union appends them in order.
    $this->conf += [
        'cHash' => $cHash,
        // Array of the additional parameters
        'cHash_array' => $cHash_array,
        // Set to defaults
        'freeIndexUid' => 0,
        'freeIndexSetId' => 0,
        // @deprecated since TYPO3 v9, will be removed in TYPO3 v10.0. Remove along with database field data_page_reg1
        'page_cache_reg1' => 0,
        // Root line uids
        'rootline_uids' => $uidRL,
        // Whether to index external documents like PDF, DOC etc. (if possible)
        'index_externals' => 1,
        // Length of description text (max 250, default 200)
        'index_descrLgd' => 200,
        // Whether to index document keywords and description (if present)
        'index_metatags' => true,
    ];

    // Init and start indexing:
    $this->init();
}
435 
/**
 * Backend API: stores the "free index" uid and set id in the indexer configuration.
 *
 * @param int $freeIndexUid Value for $this->conf['freeIndexUid']
 * @param int $freeIndexSetId Value for $this->conf['freeIndexSetId']
 */
public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0)
{
    list($this->conf['freeIndexUid'], $this->conf['freeIndexSetId']) = [$freeIndexUid, $freeIndexSetId];
}
447 
/**
 * Backend API: indexes arbitrary content as if it was a TYPO3 page.
 *
 * The given values are stored in $this->conf and a minimal HTML document is
 * built from title, keywords, description and content (each passed through
 * htmlspecialchars()). That document is then handed to indexTypo3PageContent().
 *
 * @param string $title Becomes the <title> of the generated document
 * @param string $keywords Becomes the "keywords" meta tag
 * @param string $description Becomes the "description" meta tag
 * @param string $content Becomes the <body> content
 * @param string $charset Stored as $this->conf['metaCharset']
 * @param int $mtime Stored as $this->conf['mtime']
 * @param int $crdate Stored as $this->conf['crdate']
 * @param int $recordUid Stored as $this->conf['recordUid']
 */
460  public function ‪backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0)
461  {
462  // Content of page:
463  $this->conf['mtime'] = $mtime;
464  // Most recent modification time (seconds) of the content
465  $this->conf['crdate'] = $crdate;
466  // The creation date of the TYPO3 content
467  $this->conf['recordUid'] = $recordUid;
468  // UID of the record, if applicable
469  // Construct fake HTML for parsing:
470  $this->conf['content'] = '
471  <html>
472  <head>
473  <title>' . htmlspecialchars($title) . '</title>
474  <meta name="keywords" content="' . htmlspecialchars($keywords) . '" />
475  <meta name="description" content="' . htmlspecialchars($description) . '" />
476  </head>
477  <body>
478  ' . htmlspecialchars($content) . '
479  </body>
480  </html>';
481  // Content string (HTML of TYPO3 page)
482  // Initializing charset:
483  $this->conf['metaCharset'] = $charset;
484  // Character set of content (will be converted to utf-8 during indexing)
485  $this->conf['indexedDocTitle'] = '';
486  // Alternative title for indexing
// No alternative title is set here, so the <title> of the generated document is used.
487  // Index content as if it was a TYPO3 page:
488  $this->‪indexTypo3PageContent();
489  }
490 
491  /********************************
492  *
493  * Initialization
494  *
495  *******************************/
/**
 * Initializes the indexer for the configuration in $this->conf: prepares the
 * cHash parameters, calculates the page hashes, reads the extension
 * configuration and instantiates external parsers, lexer, metaphone hook and
 * charset converter.
 */
public function init()
{
    // Initializing:
    $this->cHashParams = $this->conf['cHash_array'];
    if (is_array($this->cHashParams) && !empty($this->cHashParams)) {
        if ($this->conf['cHash']) {
            // Add this so that URL's come out right...
            $this->cHashParams['cHash'] = $this->conf['cHash'];
        }
        unset($this->cHashParams['encryptionKey']);
    }

    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();

    // Indexer configuration from Extension Manager interface (ages are configured in hours):
    $this->indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');
    $this->tstamp_minAge = MathUtility::forceIntegerInRange((int)($this->indexerConfig['minAge'] ?? 0) * 3600, 0);
    $this->tstamp_maxAge = MathUtility::forceIntegerInRange((int)($this->indexerConfig['maxAge'] ?? 0) * 3600, 0);
    $this->maxExternalFiles = MathUtility::forceIntegerInRange($this->indexerConfig['maxExternalFiles'] ?? 0, 0, 1000, 5);
    $this->flagBitMask = MathUtility::forceIntegerInRange($this->indexerConfig['flagBitMask'] ?? 0, 0, 255);
    // Workaround: If the extension configuration was not updated yet, the value is not existing
    $this->enableMetaphoneSearch = !isset($this->indexerConfig['enableMetaphoneSearch']) || $this->indexerConfig['enableMetaphoneSearch'];
    $this->storeMetaphoneInfoAsWords = !IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;

    // Initialize external document parsers:
    // Example configuration, see ext_localconf.php of this file!
    if ($this->conf['index_externals']) {
        $this->initializeExternalParsers();
    }

    // Initialize lexer (class that deconstructs the text into words):
    $lexerObjectClassName = ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'] ?? '') ?: Lexer::class;
    $this->lexerObj = GeneralUtility::makeInstance($lexerObjectClassName);
    $this->lexerObj->debug = $this->indexerConfig['debugMode'] ?? null;

    // Initialize metaphone hook:
    // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
    $metaphoneClassName = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone'] ?? '';
    if ($this->enableMetaphoneSearch && $metaphoneClassName) {
        $this->metaphoneObj = GeneralUtility::makeInstance($metaphoneClassName);
        $this->metaphoneObj->pObj = $this;
    }

    // Init charset class:
    $this->csObj = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Charset\CharsetConverter::class);
}
539 
/**
 * Instantiates all external parsers registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers']
 * and keeps those whose initParser() call succeeds.
 */
public function initializeExternalParsers()
{
    $registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] ?? [];
    foreach ($registeredParsers as $extension => $className) {
        $parser = GeneralUtility::makeInstance($className);
        // The parser is registered before it is initialized and removed
        // again if it reports that it cannot be used.
        $this->external_parsers[$extension] = $parser;
        $parser->pObj = $this;
        $isUsable = $parser->initParser($extension);
        if (!$isUsable) {
            unset($this->external_parsers[$extension]);
        }
    }
}
557 
558  /********************************
559  *
560  * Indexing; TYPO3 pages (HTML content)
561  *
562  *******************************/
/**
 * Indexes the HTML content in $this->conf['content'] as a TYPO3 page.
 *
 * The page is processed when checkMtimeTstamp() returns a positive code, when
 * no gr_list entry exists for the page hash or when indexing is forced. If
 * the content hash is already known (and the max-age was not exceeded), only
 * timestamp, set id, gr_list and rootline are updated.
 */
public function indexTypo3PageContent()
{
    $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
    $is_grlist = $this->is_grlist_set($this->hash['phash']);

    $indexingRequired = $check > 0 || !$is_grlist || $this->forceIndexing;
    if (!$indexingRequired) {
        $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        return;
    }

    // Setting message:
    if ($this->forceIndexing) {
        $reason = 'Forced';
    } elseif ($check > 0) {
        $reason = $this->reasons[$check];
    } else {
        $reason = 'Updates gr_list!';
    }
    $this->log_setTSlogMessage('Indexing needed, reason: ' . $reason, 1);

    // Divide into title, keywords, description and body:
    $this->log_push('Split content', '');
    $this->contentParts = $this->splitHTMLContent($this->conf['content']);
    if ($this->conf['indexedDocTitle']) {
        $this->contentParts['title'] = $this->conf['indexedDocTitle'];
    }
    $this->log_pull();

    // The hash is calculated over all content parts, so a changed title,
    // description or keyword list leads to re-indexing as well.
    $this->content_md5h = IndexedSearchUtility::md5inthash(implode('', $this->contentParts));

    // If a page with the very same content hash is already indexed, nothing
    // but the bookkeeping has to be updated - unless the max-age was exceeded.
    $checkCHash = $this->checkContentHash();
    if (is_array($checkCHash) && $check !== 1) {
        // Update the timestamp
        $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
        $this->updateSetId($this->hash['phash']);
        // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
        $this->update_grlist($checkCHash['phash'], $this->hash['phash']);
        $this->updateRootline();
        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
        return;
    }

    $startTime = GeneralUtility::milliseconds();

    $this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
    $this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
    $this->log_pull();

    // Splitting words
    $this->log_push('Extract words from content', '');
    $splitInWords = $this->processWordsInArrays($this->contentParts);
    $this->log_pull();

    // Analyze the indexed words.
    $this->log_push('Analyze the extracted words', '');
    $indexArr = $this->indexAnalyze($splitInWords);
    $this->log_pull();

    // Submitting page (phash) record
    $this->log_push('Submitting page', '');
    $this->submitPage();
    $this->log_pull();

    // Check words and submit to word list if not there
    $this->log_push('Check word list and submit words', '');
    if (IndexedSearchUtility::isTableUsed('index_words')) {
        $this->checkWordList($indexArr);
        $this->submitWords($indexArr, $this->hash['phash']);
    }
    $this->log_pull();

    // Set parsetime
    $this->updateParsetime($this->hash['phash'], GeneralUtility::milliseconds() - $startTime);

    // Checking external files if configured for.
    $this->log_push('Checking external files', '');
    if ($this->conf['index_externals']) {
        $this->extractLinks($this->conf['content']);
    }
    $this->log_pull();
}
637 
/**
 * Splits an HTML document into title, keywords, description and body.
 *
 * The part before the first "<body" is treated as head. The title is taken
 * from the <title> tag (only the part after the first colon, if there is
 * one). If $this->conf['index_metatags'] is set, the "keywords" and
 * "description" meta tags are collected. The body is reduced to the
 * TYPO3SEARCH sections (if any), cleared of the sections listed in
 * $this->excludeSections and stripped of all tags.
 *
 * @param string $content HTML content
 * @return array Array with the keys of $this->defaultContentArray
 */
public function splitHTMLContent($content)
{
    // Divide head from body
    $contentArr = $this->defaultContentArray;
    $bodyPart = stristr($content, '<body');
    if ($bodyPart === false) {
        // Without a body tag the whole document is head. A negative zero
        // length must not be handed to substr(), it would yield an empty string.
        $contentArr['body'] = '';
        $headPart = (string)$content;
    } else {
        $contentArr['body'] = $bodyPart;
        $headPart = substr($content, 0, -strlen($bodyPart));
    }

    // Get title
    $this->embracingTags($headPart, 'TITLE', $contentArr['title'], $dummy2, $dummy);
    $titleParts = explode(':', $contentArr['title'], 2);
    $contentArr['title'] = trim($titleParts[1] ?? $titleParts[0]);

    // Get keywords and description metatags
    if ($this->conf['index_metatags']) {
        $meta = [];
        $i = 0;
        // Each call consumes one meta tag from $headPart and stores its attribute string.
        while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $meta[$i])) {
            $i++;
        }
        // @todo The code below stops at first unset tag. Is that correct?
        for ($i = 0; isset($meta[$i]); $i++) {
            // decode HTML entities, meta tag content needs to be encoded later
            $meta[$i] = GeneralUtility::get_tag_attributes($meta[$i], true);
            // Meta tags like charset or http-equiv carry neither name nor content.
            $metaName = $meta[$i]['name'] ?? '';
            $metaContent = $meta[$i]['content'] ?? '';
            if (stristr($metaName, 'keywords')) {
                $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaContent);
            }
            if (stristr($metaName, 'description')) {
                $contentArr['description'] .= ',' . $metaContent;
            }
        }
    }

    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($contentArr['body']);

    // Get rid of unwanted sections (ie. scripting and style stuff) in body
    $tagList = explode(',', $this->excludeSections);
    foreach ($tagList as $tag) {
        while ($this->embracingTags($contentArr['body'], $tag, $dummy, $contentArr['body'], $dummy2)) {
        }
    }

    // Remove tags, but first make sure we don't concatenate words by doing it
    $contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
    $contentArr['body'] = trim(strip_tags($contentArr['body']));
    $contentArr['keywords'] = trim($contentArr['keywords']);
    $contentArr['description'] = trim($contentArr['description']);

    return $contentArr;
}
690 
/**
 * Extracts the charset from the "Content-Type" http-equiv meta tag of an
 * HTML document.
 *
 * @param string $content HTML content
 * @return string|null The charset as written in the document, NULL if none was found
 */
public function getHTMLcharset($content)
{
    $metaTagFound = preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i', $content, $metaTag);
    if (!$metaTagFound) {
        return null;
    }
    $charsetFound = preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i', $metaTag[0], $charsetMatch);
    if (!$charsetFound) {
        return null;
    }
    return $charsetMatch[1];
}
705 
/**
 * Converts an HTML document to UTF-8 and decodes its HTML entities.
 *
 * @param string $content HTML content
 * @param string $charset Charset of the content; if empty, it is looked up with getHTMLcharset()
 * @return string Converted content
 */
public function convertHTMLToUtf8($content, $charset = '')
{
    // Find charset:
    if (!$charset) {
        $charset = $this->getHTMLcharset($content);
    }
    $normalizedCharset = trim(strtolower($charset));

    // Convert charset, unless it is unknown or already UTF-8:
    $isConvertible = $normalizedCharset && $normalizedCharset !== 'utf-8';
    if ($isConvertible) {
        $content = mb_convert_encoding($content, 'utf-8', $normalizedCharset);
    }

    // Convert entities, assuming document is now UTF-8:
    $decoded = html_entity_decode($content);
    return $decoded;
}
725 
/**
 * Finds the first occurrence of a tag in a string (case-insensitive) and
 * splits the string around it.
 *
 * If a matching end tag follows, $tagContent is the text between start and
 * end tag and $stringAfter is the input with the whole element removed. If
 * there is no end tag, $tagContent is empty and $stringAfter is everything
 * following the start tag (the text in front of the tag is dropped).
 *
 * @param string $string Content to search in
 * @param string $tagName Tag name, eg. "script"
 * @param string $tagContent Passed by reference: content inside the found tag
 * @param string $stringAfter Passed by reference: remaining content, see above
 * @param string $paramList Passed by reference: raw attribute string of the start tag
 * @return bool TRUE if the start tag was found, otherwise FALSE (the reference parameters are left untouched)
 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
{
    $endTag = '</' . $tagName . '>';
    $startTag = '<' . $tagName;
    // stristr used because we want a case-insensitive search for the tag.
    $fromStartTag = stristr($string, $startTag);
    if (!$fromStartTag) {
        return false;
    }
    // Split into the attribute string and everything behind the closing ">".
    // An unterminated start tag has no ">", so the second part is padded
    // with an empty string instead of reading a missing offset.
    $afterTagName = (string)substr($fromStartTag, strlen($startTag));
    list($paramList, $remainder) = array_pad(explode('>', $afterTagName, 2), 2, '');
    $fromEndTag = stristr($remainder, $endTag);
    if ($fromEndTag) {
        // The start tag sits exactly where stristr() found it.
        $stringBefore = (string)substr($string, 0, strlen($string) - strlen($fromStartTag));
        $tagContent = substr($remainder, 0, strlen($remainder) - strlen($fromEndTag));
        $stringAfter = $stringBefore . substr($fromEndTag, strlen($endTag));
    } else {
        $tagContent = '';
        $stringAfter = $remainder;
    }
    return true;
}
760 
/**
 * Reduces the body to the sections marked with <!--TYPO3SEARCH_begin--> and
 * <!--TYPO3SEARCH_end--> comments.
 *
 * Content following a "begin" marker is kept. An "end" marker keeps the
 * content in front of it only if no "begin" marker preceded it.
 *
 * @param string $body HTML content, passed by reference and modified if markers are found
 * @return bool TRUE if at least one marker was found, otherwise FALSE (body is left untouched)
 */
public function typoSearchTags(&$body)
{
    $expBody = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body);
    if (!is_array($expBody) || count($expBody) <= 1) {
        return false;
    }
    $body = '';
    // Content seen since the last marker that was neither "begin" nor "end".
    $prev = '';
    foreach ($expBody as $val) {
        $part = explode('-->', $val, 2);
        $marker = trim($part[0]);
        if ($marker === 'begin') {
            // A marker whose comment is never closed has no content part.
            $body .= $part[1] ?? '';
            $prev = '';
        } elseif ($marker === 'end') {
            $body .= $prev;
        } else {
            $prev = $val;
        }
    }
    return true;
}
787 
/**
 * Extracts the links from the HTML content and indexes the linked documents.
 *
 * Links with a URL scheme are indexed as external URLs if the extension
 * setting "indexExternalURLs" is set. Links without query string that
 * resolve to an existing local file are indexed as regular documents, or
 * added to the crawler queue when "useCrawlerForExternalFiles" is set and the
 * crawler extension is loaded.
 *
 * @param string $content HTML content
 */
public function extractLinks($content)
{
    // Get links:
    $list = $this->extractHyperLinks($content);

    // The crawler is optional; it stays NULL when it is not configured or not loaded.
    $crawler = null;
    if (!empty($this->indexerConfig['useCrawlerForExternalFiles']) && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
        $crawler = GeneralUtility::makeInstance(\tx_crawler_lib::class);
    }

    // Traverse links:
    foreach ($list as $linkInfo) {
        // Decode entities.
        // localPath means: This file is sent by a download script. While the indexed URL has to
        // point to $linkInfo['href'], the absolute path to the file is specified here!
        $linkSource = htmlspecialchars_decode($linkInfo['localPath'] ?: $linkInfo['href']);

        // Parse URL (parse_url() returns FALSE for seriously malformed URLs):
        $qParts = parse_url($linkSource);
        $qParts = is_array($qParts) ? $qParts : [];
        $query = $qParts['query'] ?? '';

        // Check for jumpurl (TYPO3 specific thing...)
        if ($query && strstr($query, 'jumpurl=')) {
            parse_str($query, $getP);
            $linkSource = $getP['jumpurl'] ?? '';
            $qParts = parse_url($linkSource);
            $qParts = is_array($qParts) ? $qParts : [];
            $query = $qParts['query'] ?? '';
        }

        if (!$linkInfo['localPath'] && !empty($qParts['scheme'])) {
            if (!empty($this->indexerConfig['indexExternalURLs'])) {
                // Index external URL (http or otherwise)
                $this->indexExternalUrl($linkSource);
            }
            continue;
        }
        if ($query) {
            // Local links with a query string are not treated as files.
            continue;
        }

        $linkSource = urldecode($linkSource);
        if (GeneralUtility::isAllowedAbsPath($linkSource)) {
            $localFile = $linkSource;
        } else {
            $localFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $linkSource);
        }
        if (!$localFile || !@is_file($localFile)) {
            continue;
        }

        // Index local file:
        if (is_object($crawler)) {
            $params = ['document' => $linkSource];
            if ($linkInfo['localPath']) {
                $params['alturl'] = $linkInfo['href'];
            }
            // The page content is not needed in the queue entry.
            $params['conf'] = $this->conf;
            unset($params['conf']['content']);
            $crawler->addQueueEntry_callBack(0, $params, Hook\CrawlerFilesHook::class, $this->conf['id']);
            $this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
        } elseif ($linkInfo['localPath']) {
            $fI = pathinfo($linkSource);
            $ext = strtolower($fI['extension'] ?? '');
            $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
        } else {
            $this->indexRegularDocument($linkSource);
        }
    }
}
863 
/**
 * Extracts all <a> tags with a non-empty href that is not a pure fragment
 * ("#...") link.
 *
 * @param string $html HTML content
 * @return array List of arrays with the keys "tag" (the tag as found), "href" and "localPath" (see createLocalPath())
 */
public function extractHyperLinks($html)
{
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $htmlParts = $htmlParser->splitTags('a', $html);
    $hyperLinksData = [];
    foreach ($htmlParts as $index => $tagData) {
        // Only the odd entries of the split result are processed as tags.
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'a') {
            continue;
        }
        // Anchors like <a name="..."> have no href at all.
        $href = $tagAttributes[0]['href'] ?? '';
        if ($href && $href[0] !== '#') {
            $hyperLinksData[] = [
                'tag' => $tagData,
                'href' => $href,
                'localPath' => $this->createLocalPath(urldecode($href)),
            ];
        }
    }
    return $hyperLinksData;
}
893 
/**
 * Returns the href of the first <base> tag that has a non-empty href.
 *
 * @param string $html HTML content
 * @return string The base href, or an empty string if there is none
 */
public function extractBaseHref($html)
{
    $href = '';
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $htmlParts = $htmlParser->splitTags('base', $html);
    foreach ($htmlParts as $index => $tagData) {
        // Only the odd entries of the split result are processed as tags.
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) === 'base') {
            // A <base> tag may carry only a target and no href.
            $href = $tagAttributes[0]['href'] ?? '';
            if ($href) {
                break;
            }
        }
    }
    return $href;
}
919 
920  /******************************************
921  *
922  * Indexing; external URL
923  *
924  ******************************************/
/**
 * Indexes an external URL if its Content-Type header announces "text/html".
 *
 * The document is fetched, written to a temporary file and indexed through
 * indexRegularDocument() with forced indexing. The fetched content is also
 * kept in $this->indexExternalUrl_content.
 *
 * @param string $externalUrl URL, eg. "http://typo3.org/"
 */
public function indexExternalUrl($externalUrl)
{
    // Get headers. Nothing is returned if the request failed, and header
    // names are case-insensitive, so the lookup is normalized.
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    $contentType = '';
    if (is_array($urlHeaders)) {
        $urlHeaders = array_change_key_case($urlHeaders, CASE_LOWER);
        $contentType = (string)($urlHeaders['content-type'] ?? '');
    }
    if (!stristr($contentType, 'text/html')) {
        return;
    }

    $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
    if ((string)$content === '') {
        return;
    }

    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    if (!$tmpFile) {
        return;
    }
    try {
        GeneralUtility::writeFile($tmpFile, $content);
        // Index that file. Using "TRUE" for second parameter to force indexing of
        // external URLs (mtime doesn't make sense, does it?)
        $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
    } finally {
        // Remove the temporary file even if indexing failed.
        unlink($tmpFile);
    }
}
950 
/**
 * Fetches the response headers of a URL.
 *
 * @param string $url URL to request
 * @return array|null Header values keyed by header name as sent by the server, NULL if nothing was returned for the URL
 */
public function getUrlHeaders($url)
{
    // Try to get the headers only
    $content = GeneralUtility::getUrl($url, 2);
    if ((string)$content === '') {
        return null;
    }

    // Compile headers:
    $retVal = [];
    foreach (GeneralUtility::trimExplode(LF, $content, true) as $line) {
        if (trim($line) === '') {
            break;
        }
        // Lines without a colon (like the HTTP status line) are no header fields.
        if (strpos($line, ':') === false) {
            continue;
        }
        list($headKey, $headValue) = explode(':', $line, 2);
        // Whitespace around the field value is not part of the value.
        $retVal[trim($headKey)] = trim($headValue);
    }
    return $retVal;
}
975 
/**
 * Resolves the local file path for a link by trying the createLocalPath*
 * strategies in a fixed order; the first non-empty result wins.
 *
 * @param string $sourcePath Link source (href)
 * @return string Local path, or the (empty) result of the last strategy if none matched
 */
protected function createLocalPath($sourcePath)
{
    $strategies = [
        'createLocalPathFromT3vars',
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
        'createLocalPathFromRelativeURL'
    ];
    $resolvedPath = '';
    foreach ($strategies as $strategy) {
        $resolvedPath = $this->{$strategy}($sourcePath);
        if ($resolvedPath != '') {
            return $resolvedPath;
        }
    }
    return $resolvedPath;
}
1000 
/**
 * Looks the link up in $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'],
 * which maps the short MD5 of a link to a file path.
 *
 * @param string $sourcePath Link source (href)
 * @return string Registered path if it is an existing file, otherwise an empty string
 */
protected function createLocalPathFromT3vars($sourcePath)
{
    $registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'] ?? null;
    if (!is_array($registeredFiles)) {
        return '';
    }
    $key = GeneralUtility::shortMD5($sourcePath);
    // Note: not using self::isAllowedLocalFile here because this method
    // is allowed to index files outside of the web site (for example,
    // protected downloads)
    $isRegisteredFile = isset($registeredFiles[$key]) && is_file($registeredFiles[$key]);
    return $isRegisteredFile ? $registeredFiles[$key] : '';
}
1024 
/**
 * Resolves links that start with the site URL (TYPO3_SITE_URL) to a file
 * below the public web path.
 *
 * @param string $sourcePath Link source (href)
 * @return string Local path if the file is allowed, otherwise an empty string
 */
protected function createLocalPathUsingDomainURL($sourcePath)
{
    $localPath = '';
    $baseURL = (string)GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
    $baseURLLength = strlen($baseURL);
    // An empty site URL must not match; an empty needle is either an error
    // or a prefix of every string, depending on the PHP version.
    if ($baseURLLength > 0 && strpos($sourcePath, $baseURL) === 0) {
        $sourcePath = substr($sourcePath, $baseURLLength);
        $localPath = Environment::getPublicPath() . '/' . $sourcePath;
        if (!self::isAllowedLocalFile($localPath)) {
            $localPath = '';
        }
    }
    return $localPath;
}
1045 
/**
 * Resolves links that start with the configured "config.absRefPrefix" to a
 * file below the public web path. Only works when a frontend controller is
 * available in $GLOBALS['TSFE'].
 *
 * @param string $sourcePath Link source (href)
 * @return string Local path if the file is allowed, otherwise an empty string
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath)
{
    $frontendController = $GLOBALS['TSFE'] ?? null;
    if (!$frontendController instanceof TypoScriptFrontendController) {
        return '';
    }
    $prefix = $frontendController->config['config']['absRefPrefix'];
    $prefixLength = strlen($prefix);
    $startsWithPrefix = $prefixLength > 0 && strpos($sourcePath, $prefix) === 0;
    if (!$startsWithPrefix) {
        return '';
    }
    $candidate = Environment::getPublicPath() . '/' . substr($sourcePath, $prefixLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1069 
/**
 * Resolves links that start with a slash to a file below the public web path.
 *
 * @param string $sourcePath Link source (href)
 * @return string Local path if the file is allowed, otherwise an empty string
 */
protected function createLocalPathFromAbsoluteURL($sourcePath)
{
    $localPath = '';
    // isset() guards against an empty string, which has no first character.
    if (isset($sourcePath[0]) && $sourcePath[0] === '/') {
        $sourcePath = substr($sourcePath, 1);
        $localPath = Environment::getPublicPath() . '/' . $sourcePath;
        if (!self::isAllowedLocalFile($localPath)) {
            $localPath = '';
        }
    }
    return $localPath;
}
1089 
/**
 * Resolves relative links (see isRelativeURL()) to a file below the public
 * web path.
 *
 * @param string $sourcePath Link source (href)
 * @return string Local path if the file is allowed, otherwise an empty string
 */
protected function createLocalPathFromRelativeURL($sourcePath)
{
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }
    $candidate = Environment::getPublicPath() . '/' . $sourcePath;
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1107 
/**
 * Checks whether a URL is relative, i.e. has no scheme and a path that does
 * not start with a slash. URLs that cannot be parsed or have no path at all
 * count as relative.
 *
 * @param string $url URL to check
 * @return bool TRUE if the URL is relative
 */
protected static function isRelativeURL($url)
{
    $urlParts = @parse_url($url);
    if (!is_array($urlParts)) {
        // parse_url() returns FALSE for seriously malformed URLs.
        $urlParts = [];
    }
    $scheme = $urlParts['scheme'] ?? '';
    // The path is missing for URLs like "?a=b" or "#top".
    $path = $urlParts['path'] ?? '';
    return $scheme === '' && strpos($path, '/') !== 0;
}
1119 
/**
 * Tells whether a path, after resolving "../" segments, points to an existing file below the public web path.
 *
 * @param string $filePath Absolute file path
 * @return bool
 */
protected static function isAllowedLocalFile($filePath)
{
    $filePath = GeneralUtility::resolveBackPath($filePath);
    // Compare against the public path including its trailing slash. A bare prefix match would
    // also accept sibling directories that merely share the prefix (e.g. "<public>-private/").
    $publicPathPrefix = rtrim(Environment::getPublicPath(), '/') . '/';
    $insideWebPath = strpos($filePath, $publicPathPrefix) === 0;
    return $insideWebPath && is_file($filePath);
}
1133 
1134  /******************************************
1135  *
1136  * Indexing; external files (PDF, DOC, etc)
1137  *
1138  ******************************************/
/**
 * Indexes an external document (PDF, DOC, etc.) using the external parser registered for its extension.
 *
 * @param string $file Relative (to the public web path) or absolute file name
 * @param bool $force If set, the mtime/content-hash checks and the external file limit do not prevent indexing
 * @param string $contentTmpFile File to read the content from instead of $file
 * @param string $altExtension Extension to use instead of the one derived from $file
 */
public function indexRegularDocument($file, $force = false, $contentTmpFile = '', $altExtension = '')
{
    // Init
    $fI = pathinfo($file);
    // pathinfo() has no 'extension' key for files without a suffix
    $ext = $altExtension ?: strtolower($fI['extension'] ?? '');
    // Create abs-path:
    if (!$contentTmpFile) {
        if (!GeneralUtility::isAbsPath($file)) {
            // Relative, prepend public web path:
            $absFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $file);
        } else {
            // Absolute, pass-through:
            $absFile = $file;
        }
        $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
    } else {
        $absFile = $contentTmpFile;
    }
    if (!$absFile || !@is_file($absFile)) {
        $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
        return;
    }
    if (empty($this->external_parsers[$ext])) {
        $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
        return;
    }
    // Indexing the document:
    $fileInfo = stat($absFile);
    $cParts = $this->fileContentParts($ext, $absFile);
    foreach ($cParts as $cPKey) {
        $this->internal_log = [];
        $this->log_push('Index: ' . str_replace('.', '_', PathUtility::basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
        $Pstart = GeneralUtility::milliseconds();
        $subinfo = ['key' => $cPKey];
        // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
        $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
        $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
        if ($check > 0 || $force) {
            if ($check > 0) {
                $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
            } else {
                $this->log_setTSlogMessage('Indexing forced by flag', 1);
            }
            // Check external file counter:
            if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                // Divide into title,keywords,description and body:
                $this->log_push('Split content', '');
                $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                $this->log_pull();
                if (is_array($contentParts)) {
                    // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                    // This assignment was missing: $content_md5h was used below without ever being set.
                    $content_md5h = IndexedSearchUtility::md5inthash(implode('', $contentParts));
                    if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                        // Increment counter:
                        $this->externalFileCounter++;
                        // Splitting words
                        $this->log_push('Extract words from content', '');
                        $splitInWords = $this->processWordsInArrays($contentParts);
                        $this->log_pull();
                        // Analyze the indexed words.
                        $this->log_push('Analyze the extracted words', '');
                        $indexArr = $this->indexAnalyze($splitInWords);
                        $this->log_pull();
                        // Submitting page (phash) record
                        $this->log_push('Submitting page', '');
                        // The creation time of a file cannot be determined reliably, so stat()'s ctime is passed on
                        $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
                        $this->log_pull();
                        // Check words and submit to word list if not there
                        $this->log_push('Check word list and submit words', '');
                        if (IndexedSearchUtility::isTableUsed('index_words')) {
                            $this->checkWordList($indexArr);
                            $this->submitWords($indexArr, $phash_arr['phash']);
                        }
                        $this->log_pull();
                        // Set parsetime
                        $this->updateParsetime($phash_arr['phash'], GeneralUtility::milliseconds() - $Pstart);
                    } else {
                        // Update the timestamp
                        $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
                        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                    }
                } else {
                    $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                }
            } else {
                $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
            }
        } else {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        }
        // Checking and setting sections. A section record is set for the file even if the file was not indexed.
        $this->submitFile_section($phash_arr['phash']);
        $this->log_pull();
    }
}
1244 
/**
 * Reads the content of an external file by delegating to the parser object registered for the extension.
 *
 * @param string $fileExtension File extension, used as key into $this->external_parsers
 * @param string $absoluteFileName Absolute file name
 * @param string $sectionPointer Section pointer, passed through to the parser
 * @return mixed Whatever the parser returns, or null if no parser object is registered
 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
{
    $parser = $this->external_parsers[$fileExtension];
    return is_object($parser)
        ? $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
        : null;
}
1263 
/**
 * Asks the parser object registered for the extension into which parts the file should be divided.
 *
 * @param string $ext File extension, used as key into $this->external_parsers
 * @param string $absFile Absolute file name
 * @return mixed The parser's answer, or [0] if no parser object is registered
 */
public function fileContentParts($ext, $absFile)
{
    $parser = $this->external_parsers[$ext];
    return is_object($parser) ? $parser->fileContentParts($ext, $absFile) : [0];
}
1280 
/**
 * Wraps plain content into a content array: a copy of the default content array with 'body' set.
 *
 * @param string $content Content to use as body
 * @return array
 */
public function splitRegularContent($content)
{
    // array_replace() keeps existing keys in place and overwrites (or appends) 'body'
    return array_replace($this->defaultContentArray, ['body' => $content]);
}
1294 
1295  /**********************************
1296  *
1297  * Analysing content, Extracting words
1298  *
1299  **********************************/
/**
 * Converts every non-empty entry of the content array to UTF-8 and decodes HTML entities in it.
 *
 * @param array $contentArr Content array, modified in place
 * @param string $charset Charset the content is currently encoded in
 */
public function charsetEntity2utf8(&$contentArr, $charset)
{
    foreach ($contentArr as $key => $value) {
        if ((string)$value === '') {
            continue;
        }
        // Convert charset if necessary
        if ($charset !== 'utf-8') {
            $value = mb_convert_encoding($value, 'utf-8', $charset);
        }
        // Decode all numeric / html-entities in the string to real characters. Flags and target
        // encoding are given explicitly: the defaults leave single-quote entities such as &#039;
        // untouched on PHP < 8.1 and take the output charset from the ini setting, while the
        // content is known to be UTF-8 at this point.
        $contentArr[$key] = html_entity_decode($value, ENT_QUOTES, 'UTF-8');
    }
}
1319 
/**
 * Splits every part of the content array into words using the lexer object.
 *
 * @param array $contentArr Content array with string parts
 * @return array Content array with each part replaced by the lexer's word list
 */
public function processWordsInArrays($contentArr)
{
    foreach ($contentArr as $part => $text) {
        $contentArr[$part] = $this->lexerObj->split2Words($text);
    }
    // Duplicates are not wanted in title, keywords and description
    foreach (['title', 'keywords', 'description'] as $part) {
        $contentArr[$part] = array_unique($contentArr[$part]);
    }
    return $contentArr;
}
1339 
/**
 * Builds a short description from the body: whitespace runs are collapsed to single spaces and the
 * result is cut to the configured length ($this->conf['index_descrLgd'], forced into 0..255, default 200).
 *
 * @param array $contentArr Content array, 'body' is read
 * @return string Description, or an empty string when the resulting length limit is 0
 */
public function bodyDescription($contentArr)
{
    // Initialised up front: previously the variable was undefined when the limit was 0
    $bodyDescription = '';
    $maxL = MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
    if ($maxL) {
        $bodyDescription = preg_replace('/\s+/u', ' ', $contentArr['body']);
        // Shorten the string, never cutting inside a multi-byte character:
        $bodyDescription = mb_strcut($bodyDescription, 0, $maxL, 'utf-8');
    }
    return $bodyDescription;
}
1357 
/**
 * Builds the word index array from a content array of word lists.
 *
 * @param array $content Content array with word lists for 'title', 'keywords', 'description' and 'body'
 * @return array Index array as filled by analyzeHeaderinfo() and analyzeBody()
 */
public function indexAnalyze($content)
{
    $indexArr = [];
    // Header part => bit offset passed to analyzeHeaderinfo()
    $headerOffsets = ['title' => 7, 'keywords' => 6, 'description' => 5];
    foreach ($headerOffsets as $part => $bitOffset) {
        $this->analyzeHeaderinfo($indexArr, $content, $part, $bitOffset);
    }
    $this->analyzeBody($indexArr, $content);
    return $indexArr;
}
1373 
/**
 * Adds the words of one header part (title, keywords, description) to the index array.
 *
 * @param array $retArr Index array, modified in place; keyed by the (truncated) word
 * @param array $content Content array of word lists
 * @param string $key Part of $content to process
 * @param int $offset Bit position set in the word's 'cmp' flags for this part
 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset)
{
    foreach ($content[$key] as $val) {
        // Cut to 60 because the index_words.baseword varchar field has this length. This MUST be the same.
        // mb_strcut() keeps the limit but never splits a multi-byte UTF-8 character, which
        // a plain substr() could do.
        $val = mb_strcut($val, 0, 60, 'utf-8');
        if (!isset($retArr[$val])) {
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Priority used for flagBitMask feature (see extension configuration).
        // 'cmp' and 'count' do not exist on the first occurrence, hence the defaults.
        $retArr[$val]['cmp'] = ($retArr[$val]['cmp'] ?? 0) | pow(2, $offset);
        // Increase number of occurrences
        $retArr[$val]['count'] = ($retArr[$val]['count'] ?? 0) + 1;
        $this->wordcount++;
    }
}
1405 
/**
 * Adds the body words to the index array.
 *
 * @param array $retArr Index array, modified in place; keyed by the (truncated) word
 * @param array $content Content array of word lists, 'body' is read
 */
public function analyzeBody(&$retArr, $content)
{
    foreach ($content['body'] as $key => $val) {
        // Cut to 60 because the index_words.baseword varchar field has this length. This MUST be the same.
        // mb_strcut() keeps the limit but never splits a multi-byte UTF-8 character, which
        // a plain substr() could do.
        $val = mb_strcut($val, 0, 60, 'utf-8');
        if (!isset($retArr[$val])) {
            // First occurrence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Increase number of occurrences; 'count' does not exist on the first occurrence
        $retArr[$val]['count'] = ($retArr[$val]['count'] ?? 0) + 1;
        $this->wordcount++;
    }
}
1435 
/**
 * Computes the metaphone value of a word, using the metaphone object if one is set and
 * PHP's native metaphone() otherwise.
 *
 * @param string $word
 * @param bool $returnRawMetaphoneValue If set, the raw metaphone value is returned instead of its integer hash
 * @return mixed Raw metaphone value, its md5inthash() integer, or 0 when the raw value is an empty string
 */
public function metaphone($word, $returnRawMetaphoneValue = false)
{
    $rawValue = is_object($this->metaphoneObj)
        ? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
        // Native PHP function instead of the advanced metaphone object
        : metaphone($word);
    if ($returnRawMetaphoneValue) {
        return $rawValue;
    }
    // Create hash and return integer
    return $rawValue !== '' ? IndexedSearchUtility::md5inthash($rawValue) : 0;
}
1461 
1462  /********************************
1463  *
1464  * SQL; TYPO3 Pages
1465  *
1466  *******************************/
/**
 * Writes the index records of the current TYPO3 page: index_phash, index_section, index_grlist,
 * index_fulltext and, in debug mode, index_debug. Existing records for the phash are removed first.
 */
public function submitPage()
{
    // Remove any current data for this phash:
    $this->removeOldIndexedPages($this->hash['phash']);
    // setting new phash_row
    $fields = [
        'phash' => $this->hash['phash'],
        'phash_grouping' => $this->hash['phash_grouping'],
        'cHashParams' => serialize($this->cHashParams),
        'static_page_arguments' => is_array($this->conf['staticPageArguments']) ? json_encode($this->conf['staticPageArguments']) : null,
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        // @deprecated since TYPO3 v9, will be removed in TYPO3 v10.0. Remove along with database field data_page_reg1
        'data_page_reg1' => $this->conf['page_cache_reg1'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // TYPO3 page
        'item_type' => 0,
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => (int)$this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        // Creation date of page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->insert(
            'index_phash',
            $fields,
            ['cHashParams' => Connection::PARAM_LOB]
        );
    }
    // PROCESSING index_section
    $this->submit_section($this->hash['phash'], $this->hash['phash']);
    // PROCESSING index_grlist
    $this->submit_grlist($this->hash['phash'], $this->hash['phash']);
    // PROCESSING index_fulltext
    $fields = [
        'phash' => $this->hash['phash'],
        'fulltextdata' => implode(' ', $this->contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    $fullTextDataLength = (int)($this->indexerConfig['fullTextDataLength'] ?? 0);
    if ($fullTextDataLength > 0) {
        // The content is UTF-8: cut at the byte limit without splitting a multi-byte character
        $fields['fulltextdata'] = mb_strcut($fields['fulltextdata'], 0, $fullTextDataLength, 'utf-8');
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext');
        $connection->insert('index_fulltext', $fields);
    }
    // PROCESSING index_debug
    if (!empty($this->indexerConfig['debugMode'])) {
        $fields = [
            'phash' => $this->hash['phash'],
            'debuginfo' => serialize([
                'cHashParams' => $this->cHashParams,
                'external_parsers initialized' => array_keys($this->external_parsers),
                'conf' => array_merge($this->conf, ['content' => substr($this->conf['content'], 0, 1000)]),
                'contentParts' => array_merge($this->contentParts, ['body' => substr($this->contentParts['body'], 0, 1000)]),
                'logs' => $this->internal_log,
                'lexer' => $this->lexerObj->debugString
            ])
        ];
        if (IndexedSearchUtility::isTableUsed('index_debug')) {
            $connection = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('index_debug');
            $connection->insert('index_debug', $fields);
        }
    }
}
1551 
/**
 * Inserts a gr_list record into index_grlist, using the gr_list of the current configuration.
 *
 * @param int $hash Value for the phash column
 * @param int $phash_x Value for the phash_x column
 */
public function submit_grlist($hash, $phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    $groupList = $this->conf['gr_list'];
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->insert('index_grlist', [
            'phash' => $hash,
            'phash_x' => $phash_x,
            'hash_gr_list' => IndexedSearchUtility::md5inthash($groupList),
            'gr_list' => $groupList
        ]);
}
1574 
/**
 * Inserts a section record into index_section for the page of the current configuration.
 *
 * @param int $hash Value for the phash column
 * @param int $hash_t3 Value for the phash_t3 column
 */
public function submit_section($hash, $hash_t3)
{
    $fields = [
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id']
    ];
    // Add the rootline fields (rl0, rl1, rl2 and configured extras), the same set that
    // updateRootline() maintains on this table; without this a new record carries none of them.
    $this->getRootLineFields($fields);
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_section');
        $connection->insert('index_section', $fields);
    }
}
1596 
/**
 * Removes all index records of a TYPO3 page.
 *
 * @param int $phash phash of the page
 */
public function removeOldIndexedPages($phash)
{
    // Removing old registrations for all tables. Because the pages are TYPO3 pages
    // there can be nothing else than 1-1 relations here.
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    $tableArray = ['index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug'];
    foreach ($tableArray as $table) {
        // Skip tables that are not in use, as removeOldIndexedFiles() does
        if (IndexedSearchUtility::isTableUsed($table)) {
            $connectionPool->getConnectionForTable($table)->delete($table, ['phash' => (int)$phash]);
        }
    }

    // Removing all index_section records with hash_t3 set to this hash (this includes such
    // records set for external media on the page as well!). The re-insert of these records
    // are done in indexRegularDocument($file).
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        $connectionPool->getConnectionForTable('index_section')
            ->delete('index_section', ['phash_t3' => (int)$phash]);
    }
}
1622 
1623  /********************************
1624  *
1625  * SQL; External media
1626  *
1627  *******************************/
/**
 * Writes the index records of an external file: index_phash, index_fulltext and, in debug mode,
 * index_debug. Existing records for the phash are removed first.
 *
 * @param array $hash Hash array, 'phash' and 'phash_grouping' are read
 * @param string $file File name
 * @param array $subinfo Sub-part information, stored serialized as cHashParams
 * @param string $ext File extension
 * @param int $mtime Modification time of the file
 * @param int $ctime Value stored as item_crdate
 * @param int $size File size
 * @param int $content_md5h Content hash
 * @param array $contentParts Content array ('title', 'body', ...)
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
{
    // Find item Type: the parser's mapping for the extension, falling back to the extension itself
    $storeItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext] ?? '';
    $storeItemType = $storeItemType ?: $ext;
    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($hash['phash']);
    // Split filename. parse_url() returns false for malformed input and has no 'scheme'
    // key for plain local paths, so the scheme is tested with empty() below.
    $fileParts = parse_url($file);
    // Setting new
    $fields = [
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'cHashParams' => serialize($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $storeItemType,
        'item_title' => trim($contentParts['title']) ?: PathUtility::basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        'externalUrl' => !empty($fileParts['scheme']) ? 1 : 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->insert(
            'index_phash',
            $fields,
            ['cHashParams' => Connection::PARAM_LOB]
        );
    }
    // PROCESSING index_fulltext
    $fields = [
        'phash' => $hash['phash'],
        'fulltextdata' => implode(' ', $contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    $fullTextDataLength = (int)($this->indexerConfig['fullTextDataLength'] ?? 0);
    if ($fullTextDataLength > 0) {
        // Cut at the byte limit without splitting a multi-byte UTF-8 character
        $fields['fulltextdata'] = mb_strcut($fields['fulltextdata'], 0, $fullTextDataLength, 'utf-8');
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext');
        $connection->insert('index_fulltext', $fields);
    }
    // PROCESSING index_debug
    if (!empty($this->indexerConfig['debugMode'])) {
        $fields = [
            'phash' => $hash['phash'],
            'debuginfo' => serialize([
                'cHashParams' => $subinfo,
                'contentParts' => array_merge($contentParts, ['body' => substr($contentParts['body'], 0, 1000)]),
                'logs' => $this->internal_log,
                'lexer' => $this->lexerObj->debugString
            ])
        ];
        if (IndexedSearchUtility::isTableUsed('index_debug')) {
            $connection = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('index_debug');
            $connection->insert('index_debug', $fields);
        }
    }
}
1713 
/**
 * Ensures a gr_list record exists for an indexed file: if index_grlist has no record for the phash
 * with either the default gr_list or the current gr_list, one is inserted.
 *
 * @param int $hash phash of the file
 */
public function submitFile_grlist($hash)
{
    // Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_grlist');
    $count = (int)$queryBuilder->count('*')
        ->from('index_grlist')
        ->where(
            $queryBuilder->expr()->eq(
                'phash',
                $queryBuilder->createNamedParameter($hash, \PDO::PARAM_INT)
            ),
            $queryBuilder->expr()->orX(
                $queryBuilder->expr()->eq(
                    'hash_gr_list',
                    $queryBuilder->createNamedParameter(
                        IndexedSearchUtility::md5inthash($this->defaultGrList),
                        \PDO::PARAM_INT
                    )
                ),
                $queryBuilder->expr()->eq(
                    'hash_gr_list',
                    $queryBuilder->createNamedParameter(
                        IndexedSearchUtility::md5inthash($this->conf['gr_list']),
                        \PDO::PARAM_INT
                    )
                )
            )
        )
        ->execute()
        ->fetchColumn();

    if ($count === 0) {
        // No matching record yet: place one. This branch was empty before, so nothing was ever inserted.
        $this->submit_grlist($hash, $hash);
    }
}
1759 
/**
 * Ensures a section record exists for an indexed file on the current page: if index_section has
 * no record for this phash and page id, one is inserted via submit_section().
 *
 * @param int $hash phash of the file
 */
public function submitFile_section($hash)
{
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }

    $query = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_section');
    $phashMatches = $query->expr()->eq(
        'phash',
        $query->createNamedParameter($hash, \PDO::PARAM_INT)
    );
    $pageMatches = $query->expr()->eq(
        'page_id',
        $query->createNamedParameter($this->conf['id'], \PDO::PARAM_INT)
    );
    // Testing if there is already a section
    $existing = (int)$query->count('phash')
        ->from('index_section')
        ->where($phashMatches, $pageMatches)
        ->execute()
        ->fetchColumn();

    if ($existing === 0) {
        $this->submit_section($hash, $this->hash['phash']);
    }
}
1793 
/**
 * Removes the index records of an external file from every table that is in use.
 *
 * @param int $phash phash of the file
 */
public function removeOldIndexedFiles($phash)
{
    $pool = GeneralUtility::makeInstance(ConnectionPool::class);
    $criteria = ['phash' => (int)$phash];
    // Removing old registrations for tables.
    foreach (['index_phash', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        if (IndexedSearchUtility::isTableUsed($tableName)) {
            $pool->getConnectionForTable($tableName)->delete($tableName, $criteria);
        }
    }
}
1811 
1812  /********************************
1813  *
1814  * SQL Helper functions
1815  *
1816  *******************************/
/**
 * Decides from the stored index_phash record whether a document needs (re-)indexing.
 * The returned code is a key of $this->reasons; positive values mean "index", negative mean "skip".
 *
 * @param int $mtime Modification time of the document, 0 if unknown
 * @param int $phash phash of the document
 * @return int One of 1, 2, 3, 4, -1, -2
 */
public function checkMtimeTstamp($mtime, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        // Not indexed (not in index_phash)
        return 4;
    }
    $record = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(['item_mtime', 'tstamp'], 'index_phash', ['phash' => (int)$phash], [], [], 1)
        ->fetch();
    if (empty($record)) {
        // Page has never been indexed (is not represented in the index_phash table).
        return 4;
    }
    if ($this->tstamp_maxAge && $record['tstamp'] + $this->tstamp_maxAge < $GLOBALS['EXEC_TIME']) {
        // The configured max-age was exceeded for the document and thus it's indexed.
        return 1;
    }
    if ($this->tstamp_minAge && $record['tstamp'] + $this->tstamp_minAge >= $GLOBALS['EXEC_TIME']) {
        // The minimum age was not exceeded
        return -2;
    }
    // From here on: minAge is not set or has been exceeded, so mtime decides.
    if (!$mtime) {
        // The minimum age was exceeded, but mtime was not set, so the page is indexed.
        return 3;
    }
    if ($record['item_mtime'] != $mtime) {
        // mtime differs from the one stored in index_phash: time to re-index.
        return 2;
    }
    // mtime matched the document, so no changes detected and no content updated
    if ($this->tstamp_maxAge) {
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($record['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . ' seconds to expire time).', 1);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
    }
    return -1;
}
1882 
/**
 * Looks for an index_phash record with the current phash_grouping and content hash.
 * With this check a page is only indexed if its content differs from the same "phash_grouping" page.
 *
 * @return mixed The matching row (array with 'phash') if one exists, otherwise true
 */
public function checkContentHash()
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }
    $criteria = [
        'phash_grouping' => (int)$this->hash['phash_grouping'],
        'contentHash' => (int)$this->content_md5h
    ];
    $match = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(['phash'], 'index_phash', $criteria, [], [], 1)
        ->fetch();

    return !empty($match) ? $match : true;
}
1913 
/**
 * Tells whether no index_phash record exists yet for the given grouping hash and content hash.
 *
 * @param int $hashGr phash_grouping to look for
 * @param int $content_md5h Content hash to look for
 * @return bool True if no such record exists (or the table is not used), false otherwise
 */
public function checkExternalDocContentHash($hashGr, $content_md5h)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }
    $criteria = [
        'phash_grouping' => (int)$hashGr,
        'contentHash' => (int)$content_md5h
    ];
    $matches = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->count('*', 'index_phash', $criteria);

    return $matches === 0;
}
1941 
/**
 * Tells whether index_grlist contains at least one record with the given phash_x.
 *
 * @param int $phash_x
 * @return bool False as well if the table is not used
 */
public function is_grlist_set($phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return false;
    }
    $matches = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count('phash_x', 'index_grlist', ['phash_x' => (int)$phash_x]);

    return $matches > 0;
}
1964 
/**
 * Inserts a gr_list record for the phash unless one with the current gr_list already exists.
 *
 * @param int $phash phash to check and to insert for
 * @param int $phash_x phash_x value for the inserted record
 */
public function update_grlist($phash, $phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    $criteria = [
        'phash' => (int)$phash,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($this->conf['gr_list'])
    ];
    $existing = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count('phash', 'index_grlist', $criteria);

    if ($existing !== 0) {
        return;
    }
    $this->submit_grlist($phash, $phash_x);
    $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
}
1992 
/**
 * Sets tstamp of an index_phash record to the current execution time and, if given, its item_mtime.
 *
 * @param int $phash phash of the record
 * @param int $mtime If truthy, item_mtime is updated to this value as well
 */
public function updateTstamp($phash, $mtime = 0)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }

    $values = ['tstamp' => $GLOBALS['EXEC_TIME']];
    if ($mtime) {
        $values['item_mtime'] = (int)$mtime;
    }

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash');
    $connection->update('index_phash', $values, ['phash' => (int)$phash]);
}
2023 
/**
 * Sets freeIndexSetId of an index_phash record to the value from the current configuration.
 *
 * @param int $phash phash of the record
 */
public function updateSetId($phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }

    $values = ['freeIndexSetId' => (int)$this->conf['freeIndexSetId']];
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash');
    $connection->update('index_phash', $values, ['phash' => (int)$phash]);
}
2047 
/**
 * Stores the parse time of an index_phash record.
 *
 * @param int $phash phash of the record
 * @param int $parsetime Parse time, stored as integer
 */
public function updateParsetime($phash, $parsetime)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }

    $values = ['parsetime' => (int)$parsetime];
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash');
    $connection->update('index_phash', $values, ['phash' => (int)$phash]);
}
2072 
/**
 * Updates the rootline fields of all index_section records of the current page.
 */
public function updateRootline()
{
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }

    $rootlineValues = [];
    $this->getRootLineFields($rootlineValues);

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_section');
    $connection->update('index_section', $rootlineValues, ['page_id' => (int)$this->conf['id']]);
}
2095 
/**
 * Adds the rootline fields to a field array: rl0, rl1 and rl2 from the first three rootline uids,
 * plus any field configured in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'].
 *
 * @param array $fieldArray Field array, modified in place
 */
public function getRootLineFields(array &$fieldArray)
{
    foreach ([0, 1, 2] as $level) {
        $fieldArray['rl' . $level] = (int)$this->conf['rootline_uids'][$level];
    }
    // Additional fields: field name => rootline level to take the uid from
    $additionalFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] ?? [];
    foreach ($additionalFields as $fieldName => $rootLineLevel) {
        $fieldArray[$fieldName] = (int)$this->conf['rootline_uids'][$rootLineLevel];
    }
}
2111 
2112  /********************************
2113  *
2114  * SQL; Submitting words
2115  *
2116  *******************************/
/**
 * Makes sure every word of the list has a record in index_words, inserting the ones that are missing.
 *
 * @param array $wordListArray Index array keyed by word; 'hash' and 'metaphone' of each entry are read
 */
public function checkWordList($wordListArray)
{
    if (!IndexedSearchUtility::isTableUsed('index_words') || empty($wordListArray)) {
        return;
    }

    $expectedCount = count($wordListArray);
    $wordIds = array_map('intval', array_column($wordListArray, 'hash'));
    $pool = GeneralUtility::makeInstance(ConnectionPool::class);

    // How many of the words are registered already?
    $countQuery = $pool->getQueryBuilderForTable('index_words');
    $knownCount = (int)$countQuery->count('baseword')
        ->from('index_words')
        ->where(
            $countQuery->expr()->in(
                'wid',
                $countQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        )
        ->execute()
        ->fetchColumn();

    if ($knownCount === $expectedCount) {
        return;
    }

    $connection = $pool->getConnectionForTable('index_words');
    $selectQuery = $connection->createQueryBuilder();
    $knownWords = $selectQuery->select('baseword')
        ->from('index_words')
        ->where(
            $selectQuery->expr()->in(
                'wid',
                $selectQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        )
        ->execute();

    $this->log_setTSlogMessage('Inserting words: ' . ($expectedCount - $knownCount), 1);
    // Drop every word that is registered already; what remains has to be inserted
    while ($knownWord = $knownWords->fetch()) {
        unset($wordListArray[$knownWord['baseword']]);
    }

    foreach ($wordListArray as $word => $info) {
        // A duplicate-key error will occur here if a word was NOT removed by the unset() above.
        // As long as the words are not longer than 60 chars (the baseword varchar is
        // 60 characters) this is not a problem.
        $connection->insert(
            'index_words',
            [
                'wid' => $info['hash'],
                'baseword' => $word,
                'metaphone' => $info['metaphone']
            ]
        );
    }
}
2177 
/**
 * Replaces the index_rel records of a phash with one record per word of the list, skipping
 * words that are flagged as stop words in index_words.
 *
 * @param array $wordList Index array; 'hash', 'count', 'first' and 'cmp' of each entry are read
 * @param int $phash phash the words belong to
 */
public function submitWords($wordList, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_rel')) {
        return;
    }
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    $queryBuilder = $connectionPool->getQueryBuilderForTable('index_words');
    $result = $queryBuilder->select('wid')
        ->from('index_words')
        ->where(
            $queryBuilder->expr()->neq('is_stopword', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT))
        )
        ->groupBy('wid')
        ->execute();

    $stopWords = [];
    while ($row = $result->fetch()) {
        $stopWords[$row['wid']] = $row;
    }

    $connectionPool->getConnectionForTable('index_rel')->delete('index_rel', ['phash' => (int)$phash]);

    $fields = ['phash', 'wid', 'count', 'first', 'freq', 'flags'];
    $rows = [];
    foreach ($wordList as $val) {
        if (isset($stopWords[$val['hash']])) {
            continue;
        }
        $count = $val['count'] ?? 0;
        // Guard against a division by zero when no words have been counted
        $frequency = $this->wordcount > 0 ? $count / $this->wordcount : 0;
        $rows[] = [
            (int)$phash,
            (int)$val['hash'],
            (int)$count,
            // 'first' only exists for words found in the body, 'cmp' only for words found in
            // title, keywords or description
            (int)($val['first'] ?? 0),
            $this->freqMap($frequency),
            ($val['cmp'] ?? 0) & $this->flagBitMask
        ];
    }

    if (!empty($rows)) {
        $connectionPool->getConnectionForTable('index_rel')->bulkInsert('index_rel', $rows, $fields);
    }
}
2226 
/**
 * Maps a word frequency onto an integer value.
 *
 * The scaling factor is freqMax * 100 * freqRange. Frequencies up to and including 1 are
 * multiplied by that factor and capped at freqRange; larger values are divided by it.
 * The result is cast to int in both cases.
 *
 * @param float $freq Frequency to map
 * @return int Mapped frequency
 */
public function freqMap($freq)
{
    $scale = $this->freqMax * 100 * $this->freqRange;
    if ($freq <= 1) {
        $scaled = $freq * $scale;
        if ($scaled > $this->freqRange) {
            $scaled = $this->freqRange;
        }
        return (int)$scaled;
    }
    return (int)($freq / $scale);
}
2245 
2246  /********************************
2247  *
2248  * Hashing
2249  *
2250  *******************************/
/**
 * Computes the two hashes for the current page configuration and stores them in $this->hash.
 *
 * 'phash_grouping' identifies a "page" as the combination of id, type, language, mount point,
 * cHash parameters and static page arguments. 'phash' additionally includes gr_list, so that
 * a page composition depending on the login is taken into account as well.
 */
public function setT3Hashes()
{
    $conf = $this->conf;

    // Key order and value types feed serialize() and therefore determine the hash values.
    $identity = [];
    $identity['id'] = (int)$conf['id'];
    $identity['type'] = (int)$conf['type'];
    $identity['sys_lang'] = (int)$conf['sys_language_uid'];
    $identity['MP'] = (string)$conf['MP'];
    $identity['cHash'] = $this->cHashParams;
    $staticArguments = $conf['staticPageArguments'];
    $identity['staticPageArguments'] = is_array($staticArguments) ? json_encode($staticArguments) : null;

    // Grouping hash: everything except the group list.
    $this->hash['phash_grouping'] = IndexedSearchUtility::md5inthash(serialize($identity));

    // Plain phash: the same data plus the group list.
    $identity['gr_list'] = (string)$conf['gr_list'];
    $this->hash['phash'] = IndexedSearchUtility::md5inthash(serialize($identity));
}
2271 
/**
 * Builds the hash pair for a file.
 *
 * 'phash_grouping' is derived from the file alone, 'phash' from the file together with $subinfo.
 * Unlike setT3Hashes() the result is returned and not stored on the object.
 *
 * @param string $file File reference the hashes are built from
 * @param array $subinfo Additional information that distinguishes parts of the same file
 * @return array Array with the keys 'phash_grouping' and 'phash'
 */
public function setExtHashes($file, $subinfo = [])
{
    $identity = ['file' => $file];
    $groupingHash = IndexedSearchUtility::md5inthash(serialize($identity));

    // The plain phash covers the sub information in addition to the file.
    $identity['subinfo'] = $subinfo;

    return [
        'phash_grouping' => $groupingHash,
        'phash' => IndexedSearchUtility::md5inthash(serialize($identity)),
    ];
}
2293 
2294  /*********************************
2295  *
2296  * Internal logging functions
2297  *
2298  *********************************/
/**
 * Forwards a push() call to the time tracker held in $this->timeTracker.
 *
 * @param string $msg Passed as first argument to the time tracker's push()
 * @param string $key Passed as second argument to the time tracker's push()
 */
public function log_push($msg, $key)
{
    $tracker = $this->timeTracker;
    $tracker->push($msg, $key);
}
2309 
/**
 * Forwards a pull() call to the time tracker held in $this->timeTracker.
 */
public function log_pull()
{
    $tracker = $this->timeTracker;
    $tracker->pull();
}
2317 
/**
 * Sends a log message to the time tracker and records it in $this->internal_log.
 *
 * @param string $msg Message text
 * @param int $errorNum Passed on to the time tracker's setTSlogMessage() as second argument
 */
public function log_setTSlogMessage($msg, $errorNum = 0)
{
    $tracker = $this->timeTracker;
    $tracker->setTSlogMessage($msg, $errorNum);

    // Only the message text is kept locally; $errorNum is not stored.
    $this->internal_log[] = $msg;
}
2329 
/**
 * Re-joins a comma separated keyword list with ", " and wraps it in single spaces.
 *
 * The list is split with GeneralUtility::trimExplode(), the parts are joined again with a
 * comma followed by a space, and one space is put before and after the result.
 *
 * @param string $keywordList Comma separated list of keywords
 * @return string The re-joined list with a leading and a trailing space
 */
protected function addSpacesToKeywordList($keywordList)
{
    $joined = implode(', ', GeneralUtility::trimExplode(',', $keywordList));
    return sprintf(' %s ', $joined);
}
2343 }
‪TYPO3\CMS\IndexedSearch\Indexer\splitHTMLContent
‪array splitHTMLContent($content)
Definition: Indexer.php:612
‪TYPO3\CMS\IndexedSearch\Indexer\updateParsetime
‪updateParsetime($phash, $parsetime)
Definition: Indexer.php:2021
‪TYPO3\CMS\Core\Routing\PageArguments
Definition: PageArguments.php:25
‪TYPO3\CMS\IndexedSearch\Indexer\$lexerObj
‪TYPO3 CMS IndexedSearch Lexer $lexerObj
Definition: Indexer.php:217
‪TYPO3\CMS\IndexedSearch
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility\md5inthash
‪static int md5inthash($stringToHash)
Definition: IndexedSearchUtility.php:45
‪TYPO3\CMS\IndexedSearch\Indexer\$csObj
‪TYPO3 CMS Core Charset CharsetConverter $csObj
Definition: Indexer.php:205
‪TYPO3\CMS\Core\Utility\PathUtility
Definition: PathUtility.php:23
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPathUsingDomainURL
‪string createLocalPathUsingDomainURL($sourcePath)
Definition: Indexer.php:998
‪TYPO3\CMS\Core\Core\Environment\getPublicPath
‪static string getPublicPath()
Definition: Environment.php:153
‪TYPO3\CMS\IndexedSearch\Indexer\submit_grlist
‪submit_grlist($hash, $phash_x)
Definition: Indexer.php:1526
‪TYPO3\CMS\IndexedSearch\Indexer\$defaultContentArray
‪array $defaultContentArray
Definition: Indexer.php:113
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPathUsingAbsRefPrefix
‪string createLocalPathUsingAbsRefPrefix($sourcePath)
Definition: Indexer.php:1020
‪TYPO3\CMS\IndexedSearch\Indexer\indexRegularDocument
‪indexRegularDocument($file, $force=false, $contentTmpFile='', $altExtension='')
Definition: Indexer.php:1114
‪TYPO3\CMS\Core\Configuration\ExtensionConfiguration
Definition: ExtensionConfiguration.php:42
‪TYPO3\CMS\IndexedSearch\Indexer\$externalFileCounter
‪int $externalFileCounter
Definition: Indexer.php:126
‪TYPO3\CMS\IndexedSearch\Indexer\updateSetId
‪updateSetId($phash)
Definition: Indexer.php:1996
‪TYPO3\CMS\IndexedSearch\Indexer\isAllowedLocalFile
‪static bool isAllowedLocalFile($filePath)
Definition: Indexer.php:1093
‪TYPO3\CMS\IndexedSearch\Indexer\$external_parsers
‪array $external_parsers
Definition: Indexer.php:68
‪TYPO3\CMS\IndexedSearch\Indexer\indexAnalyze
‪array indexAnalyze($content)
Definition: Indexer.php:1331
‪TYPO3\CMS\IndexedSearch\Indexer\$indexerConfig
‪array $indexerConfig
Definition: Indexer.php:136
‪TYPO3
‪TYPO3\CMS\IndexedSearch\Indexer\$flagBitMask
‪bool $flagBitMask
Definition: Indexer.php:221
‪TYPO3\CMS\IndexedSearch\Indexer\isRelativeURL
‪static bool isRelativeURL($url)
Definition: Indexer.php:1081
‪TYPO3\CMS\IndexedSearch\Indexer\$storeMetaphoneInfoAsWords
‪bool $storeMetaphoneInfoAsWords
Definition: Indexer.php:194
‪TYPO3\CMS\IndexedSearch\Indexer\$cHashParams
‪array $cHashParams
Definition: Indexer.php:176
‪TYPO3\CMS\Core\Utility\MathUtility\forceIntegerInRange
‪static int forceIntegerInRange($theInt, $min, $max=2000000000, $defaultValue=0)
Definition: MathUtility.php:31
‪TYPO3\CMS\IndexedSearch\Indexer\indexExternalUrl
‪indexExternalUrl($externalUrl)
Definition: Indexer.php:898
‪TYPO3\CMS\IndexedSearch\Indexer\analyzeHeaderinfo
‪analyzeHeaderinfo(&$retArr, $content, $key, $offset)
Definition: Indexer.php:1349
‪TYPO3\CMS\IndexedSearch\Indexer\removeOldIndexedPages
‪removeOldIndexedPages($phash)
Definition: Indexer.php:1569
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility\isTableUsed
‪static bool isTableUsed($tableName)
Definition: IndexedSearchUtility.php:32
‪TYPO3\CMS\IndexedSearch\Indexer\log_pull
‪log_pull()
Definition: Indexer.php:2280
‪TYPO3\CMS\IndexedSearch\Indexer\$freqMax
‪float $freqMax
Definition: Indexer.php:186
‪TYPO3\CMS\IndexedSearch\Indexer\checkContentHash
‪mixed checkContentHash()
Definition: Indexer.php:1855
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPathFromT3vars
‪string createLocalPathFromT3vars($sourcePath)
Definition: Indexer.php:976
‪TYPO3\CMS\IndexedSearch\Indexer\convertHTMLToUtf8
‪string convertHTMLToUtf8($content, $charset='')
Definition: Indexer.php:680
‪$fields
‪$fields
Definition: pages.php:4
‪TYPO3\CMS\IndexedSearch\Indexer\bodyDescription
‪string bodyDescription($contentArr)
Definition: Indexer.php:1313
‪TYPO3\CMS\IndexedSearch\Indexer\update_grlist
‪update_grlist($phash, $phash_x)
Definition: Indexer.php:1939
‪TYPO3\CMS\Core\Context\Context
Definition: Context.php:49
‪TYPO3\CMS\Core\Utility\PathUtility\basename
‪static string basename($path)
Definition: PathUtility.php:164
‪TYPO3\CMS\IndexedSearch\Indexer\freqMap
‪int freqMap($freq)
Definition: Indexer.php:2201
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPathFromAbsoluteURL
‪string createLocalPathFromAbsoluteURL($sourcePath)
Definition: Indexer.php:1044
‪TYPO3\CMS\IndexedSearch\Indexer\$forceIndexing
‪bool $forceIndexing
Definition: Indexer.php:101
‪TYPO3\CMS\IndexedSearch\Indexer\submitFile_section
‪submitFile_section($hash)
Definition: Indexer.php:1732
‪TYPO3\CMS\IndexedSearch\Indexer\$file_phash_arr
‪array $file_phash_arr
Definition: Indexer.php:148
‪TYPO3\CMS\IndexedSearch\Indexer\$wordcount
‪int $wordcount
Definition: Indexer.php:122
‪TYPO3\CMS\IndexedSearch\Indexer\removeOldIndexedFiles
‪removeOldIndexedFiles($phash)
Definition: Indexer.php:1766
‪TYPO3\CMS\IndexedSearch\Indexer\updateRootline
‪updateRootline()
Definition: Indexer.php:2043
‪TYPO3\CMS\IndexedSearch\Indexer\is_grlist_set
‪bool is_grlist_set($phash_x)
Definition: Indexer.php:1915
‪TYPO3\CMS\IndexedSearch\Indexer\log_push
‪log_push($msg, $key)
Definition: Indexer.php:2272
‪TYPO3\CMS\IndexedSearch\Indexer\$excludeSections
‪string $excludeSections
Definition: Indexer.php:62
‪TYPO3\CMS\IndexedSearch\Indexer\checkMtimeTstamp
‪int checkMtimeTstamp($mtime, $phash)
Definition: Indexer.php:1792
‪TYPO3\CMS\IndexedSearch\Indexer\$reasons
‪array $reasons
Definition: Indexer.php:49
‪TYPO3\CMS\IndexedSearch\Indexer\$timeTracker
‪TimeTracker $timeTracker
Definition: Indexer.php:225
‪TYPO3\CMS\IndexedSearch\Indexer\$enableMetaphoneSearch
‪bool $enableMetaphoneSearch
Definition: Indexer.php:190
‪TYPO3\CMS\IndexedSearch\Indexer\charsetEntity2utf8
‪charsetEntity2utf8(&$contentArr, $charset)
Definition: Indexer.php:1273
‪TYPO3\CMS\IndexedSearch\Indexer\$indexExternalUrl_content
‪string $indexExternalUrl_content
Definition: Indexer.php:170
‪TYPO3\CMS\IndexedSearch\Indexer\$contentParts
‪array $contentParts
Definition: Indexer.php:154
‪TYPO3\CMS\IndexedSearch\Indexer\$internal_log
‪array $internal_log
Definition: Indexer.php:164
‪TYPO3\CMS\Core\Utility\HttpUtility\buildQueryString
‪static string buildQueryString(array $parameters, string $prependCharacter='', bool $skipEmptyParameters=false)
Definition: HttpUtility.php:160
‪TYPO3\CMS\IndexedSearch\Indexer\fileContentParts
‪array fileContentParts($ext, $absFile)
Definition: Indexer.php:1238
‪TYPO3\CMS\IndexedSearch\Indexer\backend_initIndexer
‪backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=[], $createCHash=false)
Definition: Indexer.php:356
‪TYPO3\CMS\IndexedSearch\Indexer\setExtHashes
‪array setExtHashes($file, $subinfo=[])
Definition: Indexer.php:2246
‪TYPO3\CMS\IndexedSearch\Indexer\$tstamp_minAge
‪int $tstamp_minAge
Definition: Indexer.php:89
‪TYPO3\CMS\IndexedSearch\Indexer\checkExternalDocContentHash
‪bool checkExternalDocContentHash($hashGr, $content_md5h)
Definition: Indexer.php:1889
‪TYPO3\CMS\IndexedSearch\Indexer\$crawlerActive
‪bool $crawlerActive
Definition: Indexer.php:107
‪TYPO3\CMS\IndexedSearch\Indexer\readFileContent
‪array readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
Definition: Indexer.php:1221
‪TYPO3\CMS\IndexedSearch\Indexer\submitFilePage
‪submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
Definition: Indexer.php:1608
‪TYPO3\CMS\IndexedSearch\Indexer\$hash
‪array $hash
Definition: Indexer.php:142
‪TYPO3\CMS\IndexedSearch\Indexer\submitWords
‪submitWords($wordList, $phash)
Definition: Indexer.php:2151
‪TYPO3\CMS\IndexedSearch\Indexer\$conf
‪array $conf
Definition: Indexer.php:130
‪TYPO3\CMS\Core\Context\LanguageAspect
Definition: LanguageAspect.php:55
‪TYPO3\CMS\IndexedSearch\Indexer\updateTstamp
‪updateTstamp($phash, $mtime=0)
Definition: Indexer.php:1966
‪TYPO3\CMS\IndexedSearch\Indexer\processWordsInArrays
‪array processWordsInArrays($contentArr)
Definition: Indexer.php:1293
‪TYPO3\CMS\IndexedSearch\Indexer\embracingTags
‪bool embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
Definition: Indexer.php:705
‪TYPO3\CMS\IndexedSearch\Indexer\backend_setFreeIndexUid
‪backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0)
Definition: Indexer.php:409
‪TYPO3\CMS\IndexedSearch\Indexer\extractLinks
‪extractLinks($content)
Definition: Indexer.php:760
‪TYPO3\CMS\IndexedSearch\Indexer\indexTypo3PageContent
‪indexTypo3PageContent()
Definition: Indexer.php:533
‪TYPO3\CMS\Core\Database\Connection
Definition: Connection.php:31
‪TYPO3\CMS\IndexedSearch\Indexer\submitFile_grlist
‪submitFile_grlist($hash)
Definition: Indexer.php:1686
‪TYPO3\CMS\IndexedSearch\Indexer\getRootLineFields
‪getRootLineFields(array &$fieldArray)
Definition: Indexer.php:2069
‪TYPO3\CMS\IndexedSearch\Indexer\$tstamp_maxAge
‪int $tstamp_maxAge
Definition: Indexer.php:82
‪TYPO3\CMS\IndexedSearch\Indexer\getUrlHeaders
‪mixed getUrlHeaders($url)
Definition: Indexer.php:924
‪TYPO3\CMS\IndexedSearch\Indexer\$metaphoneObj
‪TYPO3 CMS IndexedSearch Utility DoubleMetaPhoneUtility $metaphoneObj
Definition: Indexer.php:211
‪TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController
Definition: TypoScriptFrontendController.php:97
‪$GLOBALS
‪$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['adminpanel']['modules']
Definition: ext_localconf.php:5
‪TYPO3\CMS\Core\Core\Environment
Definition: Environment.php:39
‪TYPO3\CMS\IndexedSearch\Indexer\submitPage
‪submitPage()
Definition: Indexer.php:1437
‪TYPO3\CMS\IndexedSearch\Indexer\$metaphoneContent
‪string $metaphoneContent
Definition: Indexer.php:198
‪TYPO3\CMS\IndexedSearch\Indexer\$maxExternalFiles
‪int $maxExternalFiles
Definition: Indexer.php:95
‪TYPO3\CMS\Core\Utility\MathUtility
Definition: MathUtility.php:21
‪TYPO3\CMS\IndexedSearch\Indexer\log_setTSlogMessage
‪log_setTSlogMessage($msg, $errorNum=0)
Definition: Indexer.php:2291
‪TYPO3\CMS\IndexedSearch\Indexer\getHTMLcharset
‪string getHTMLcharset($content)
Definition: Indexer.php:664
‪TYPO3\CMS\Core\Utility\HttpUtility
Definition: HttpUtility.php:21
‪TYPO3\CMS\IndexedSearch\Indexer\extractHyperLinks
‪array extractHyperLinks($html)
Definition: Indexer.php:838
‪TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait
Definition: PublicPropertyDeprecationTrait.php:66
‪TYPO3\CMS\IndexedSearch\Indexer\$deprecatedPublicProperties
‪array $deprecatedPublicProperties
Definition: Indexer.php:43
‪TYPO3\CMS\IndexedSearch\Indexer\addSpacesToKeywordList
‪string addSpacesToKeywordList($keywordList)
Definition: Indexer.php:2305
‪TYPO3\CMS\IndexedSearch\Indexer\initializeExternalParsers
‪initializeExternalParsers()
Definition: Indexer.php:513
‪TYPO3\CMS\Core\Database\ConnectionPool
Definition: ConnectionPool.php:44
‪TYPO3\CMS\IndexedSearch\Indexer\__construct
‪__construct()
Definition: Indexer.php:230
‪TYPO3\CMS\IndexedSearch\Indexer\submit_section
‪submit_section($hash, $hash_t3)
Definition: Indexer.php:1549
‪TYPO3\CMS\IndexedSearch\Indexer\splitRegularContent
‪array splitRegularContent($content)
Definition: Indexer.php:1255
‪TYPO3\CMS\IndexedSearch\Indexer
Definition: Indexer.php:38
‪TYPO3\CMS\Core\Utility\GeneralUtility
Definition: GeneralUtility.php:45
‪TYPO3\CMS\IndexedSearch\Indexer\metaphone
‪mixed metaphone($word, $returnRawMetaphoneValue=false)
Definition: Indexer.php:1410
‪TYPO3\CMS\IndexedSearch\Indexer\typoSearchTags
‪bool typoSearchTags(&$body)
Definition: Indexer.php:734
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPathFromRelativeURL
‪string createLocalPathFromRelativeURL($sourcePath)
Definition: Indexer.php:1063
‪TYPO3\CMS\IndexedSearch\Indexer\checkWordList
‪checkWordList($wordListArray)
Definition: Indexer.php:2089
‪TYPO3\CMS\Core\TimeTracker\TimeTracker
Definition: TimeTracker.php:27
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility
Definition: IndexedSearchUtility.php:23
‪TYPO3\CMS\IndexedSearch\Indexer\analyzeBody
‪analyzeBody(&$retArr, $content)
Definition: Indexer.php:1379
‪TYPO3\CMS\IndexedSearch\Indexer\$freqRange
‪int $freqRange
Definition: Indexer.php:182
‪TYPO3\CMS\IndexedSearch\Indexer\extractBaseHref
‪string extractBaseHref($html)
Definition: Indexer.php:867
‪TYPO3\CMS\IndexedSearch\Indexer\setT3Hashes
‪setT3Hashes()
Definition: Indexer.php:2221
‪TYPO3\CMS\IndexedSearch\Indexer\$content_md5h
‪string $content_md5h
Definition: Indexer.php:160
‪TYPO3\CMS\IndexedSearch\Indexer\init
‪init()
Definition: Indexer.php:466
‪TYPO3\CMS\IndexedSearch\Indexer\backend_indexAsTYPO3Page
‪backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0)
Definition: Indexer.php:427
‪TYPO3\CMS\IndexedSearch\Indexer\$defaultGrList
‪string $defaultGrList
Definition: Indexer.php:76
‪TYPO3\CMS\Core\Database\Connection\PARAM_LOB
‪const PARAM_LOB
Definition: Connection.php:52
‪TYPO3\CMS\IndexedSearch\Indexer\hook_indexContent
‪hook_indexContent(&$pObj)
Definition: Indexer.php:240
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPath
‪string createLocalPath($sourcePath)
Definition: Indexer.php:949