‪TYPO3CMS  11.5
Indexer.php
Go to the documentation of this file.
1 <?php
2 
3 /*
4  * This file is part of the TYPO3 CMS project.
5  *
6  * It is free software; you can redistribute it and/or modify it under
7  * the terms of the GNU General Public License, either version 2
8  * of the License, or any later version.
9  *
10  * For the full copyright and license information, please read the
11  * LICENSE.txt file that was distributed with this source code.
12  *
13  * The TYPO3 project - inspiring people to share!
14  */
15 
17 
18 use Psr\Log\LogLevel;
32 
39 {
/**
 * Explanations for the codes returned by the mtime/timestamp check. Codes
 * greater than zero trigger indexing, the negative ones are logged as the
 * reason for skipping it.
 *
 * @var array<int, string>
 */
public $reasons = [
    -1 => 'mtime matched the document, so no changes detected and no content updated',
    -2 => 'The minimum age was not exceeded',
    1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
    2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
    3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
    4 => 'Page has never been indexed (is not represented in the index_phash table).',
];

/**
 * Comma separated list of tag names whose whole sections are removed from
 * the body before it is indexed (see splitHTMLContent()).
 *
 * @var string
 */
public $excludeSections = 'script,style';

/**
 * External document parser objects keyed by file extension; filled by
 * initializeExternalParsers().
 *
 * @var array
 */
public $external_parsers = [];

/**
 * Default group list.
 * NOTE(review): not read anywhere in the visible part of the class.
 *
 * @var string
 */
public $defaultGrList = '0,-1';

/**
 * Maximum age in seconds; the constructor sets it from the "maxAge"
 * extension setting multiplied by 3600.
 *
 * @var int
 */
public $tstamp_maxAge = 0;

/**
 * Minimum age in seconds; the constructor sets it from the "minAge"
 * extension setting multiplied by 3600.
 *
 * @var int
 */
public $tstamp_minAge = 0;

/**
 * Upper limit for $externalFileCounter; the constructor sets it from the
 * "maxExternalFiles" extension setting (0-1000).
 *
 * @var int
 */
public $maxExternalFiles = 0;

/**
 * When TRUE, indexTypo3PageContent() indexes regardless of the mtime check.
 *
 * @var bool
 */
public $forceIndexing = false;

/**
 * Template for the content arrays built by splitHTMLContent() and
 * splitRegularContent().
 *
 * @var array<string, string>
 */
public $defaultContentArray = [
    'title' => '',
    'description' => '',
    'keywords' => '',
    'body' => '',
];

/**
 * Number of words counted by analyzeHeaderinfo() and analyzeBody().
 *
 * @var int
 */
public $wordcount = 0;

/**
 * Number of external files indexed so far (see indexRegularDocument()).
 *
 * @var int
 */
public $externalFileCounter = 0;

/**
 * Configuration of the current indexing run, set through init().
 *
 * @var array
 */
public $conf = [];

/**
 * Extension configuration of "indexed_search", loaded in the constructor.
 *
 * @var array
 */
public $indexerConfig = [];

/**
 * Hashes of the current page; the keys "phash" and "phash_grouping" are read.
 *
 * @var array
 */
public $hash = [];

/**
 * Hashes of the external file part currently processed by
 * indexRegularDocument().
 *
 * @var array
 */
public $file_phash_arr = [];

/**
 * Content parts of the current page as returned by splitHTMLContent().
 *
 * @var array
 */
public $contentParts = [];

/**
 * Hash calculated over the imploded $contentParts.
 *
 * @var int
 */
public $content_md5h;

/**
 * Log entries; reset for every file part in indexRegularDocument().
 *
 * @var array
 */
public $internal_log = [];

/**
 * Content fetched by the most recent indexExternalUrl() call.
 *
 * @var string
 */
public $indexExternalUrl_content = '';

/**
 * NOTE(review): not read anywhere in the visible part of the class.
 *
 * @var int
 */
public $freqRange = 32000;

/**
 * NOTE(review): not read anywhere in the visible part of the class.
 *
 * @var float
 */
public $freqMax = 0.1;

/**
 * Whether metaphone values are calculated for indexed words; set in the
 * constructor from the "enableMetaphoneSearch" extension setting.
 *
 * @var bool
 */
public $enableMetaphoneSearch = false;

/**
 * Whether raw metaphone values are collected in $metaphoneContent; set in
 * the constructor (metaphone enabled and table "index_words" not used).
 *
 * @var bool
 */
public $storeMetaphoneInfoAsWords;

/**
 * Space separated raw metaphone values of the analyzed words.
 *
 * @var string
 */
public $metaphoneContent = '';

/**
 * Optional metaphone hook object created in init().
 *
 * @var object|null
 */
public $metaphoneObj;

/**
 * Lexer object created in init(); splits text into words.
 *
 * @var object
 */
public $lexerObj;

/**
 * Set in the constructor from the "flagBitMask" extension setting (0-255).
 *
 * @var int
 */
public $flagBitMask;

/**
 * Time tracker instance created in the constructor.
 *
 * @var object
 */
protected $timeTracker;
226 
/**
 * Loads the extension configuration of "indexed_search" and derives the age
 * limits, the external file limit, the flag bit mask and the metaphone
 * switches from it.
 */
public function __construct()
{
    $this->timeTracker = GeneralUtility::makeInstance(TimeTracker::class);

    // Indexer configuration from Extension Manager interface
    $settings = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');
    $this->indexerConfig = $settings;

    // Ages are configured in hours and kept in seconds.
    $minAgeSeconds = (int)($settings['minAge'] ?? 0) * 3600;
    $maxAgeSeconds = (int)($settings['maxAge'] ?? 0) * 3600;
    $this->tstamp_minAge = MathUtility::forceIntegerInRange($minAgeSeconds, 0);
    $this->tstamp_maxAge = MathUtility::forceIntegerInRange($maxAgeSeconds, 0);

    $fileLimit = (int)($settings['maxExternalFiles'] ?? 5);
    $this->maxExternalFiles = MathUtility::forceIntegerInRange($fileLimit, 0, 1000);

    $bitMask = (int)($settings['flagBitMask'] ?? 0);
    $this->flagBitMask = MathUtility::forceIntegerInRange($bitMask, 0, 255);

    // Workaround: If the extension configuration was not updated yet, the value is not existing
    $metaphoneEnabled = true;
    if (isset($settings['enableMetaphoneSearch'])) {
        $metaphoneEnabled = (bool)$settings['enableMetaphoneSearch'];
    }
    $this->enableMetaphoneSearch = $metaphoneEnabled;

    $wordTableInUse = IndexedSearchUtility::isTableUsed('index_words');
    $this->storeMetaphoneInfoAsWords = !$wordTableInUse && $this->enableMetaphoneSearch;
}
243 
244  /********************************
245  *
246  * Initialization
247  *
248  *******************************/
249 
/**
 * Initializes the indexer for one run: stores the configuration, calculates
 * the page hashes and creates the parser, lexer and metaphone objects.
 *
 * @param array|null $configuration Indexing configuration; when NULL the
 *                                  current $this->conf is kept
 */
public function init(array $configuration = null)
{
    if (is_array($configuration)) {
        $this->conf = $configuration;
    }
    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();
    // Initialize external document parsers:
    // Example configuration, see ext_localconf.php of this file!
    if (!empty($this->conf['index_externals'])) {
        // Without this call no parser is ever registered and
        // indexRegularDocument() rejects every file extension.
        $this->initializeExternalParsers();
    }
    // Initialize lexer (class that deconstructs the text into words):
    $lexerObjectClassName = ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'] ?? false) ?: Lexer::class;
    $lexer = GeneralUtility::makeInstance($lexerObjectClassName);
    $this->lexerObj = $lexer;
    $this->lexerObj->debug = (bool)($this->indexerConfig['debugMode'] ?? false);
    // Initialize metaphone hook:
    // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
    $metaphoneClassName = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone'] ?? false;
    if ($this->enableMetaphoneSearch && $metaphoneClassName) {
        $metaphoneObj = GeneralUtility::makeInstance($metaphoneClassName);
        $this->metaphoneObj = $metaphoneObj;
        $this->metaphoneObj->pObj = $this;
    }
}
281 
/**
 * Instantiates every external parser registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers']
 * and keeps those whose initParser() call succeeds.
 */
public function initializeExternalParsers()
{
    $registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] ?? [];
    foreach ($registeredParsers as $extension => $className) {
        $parser = GeneralUtility::makeInstance($className);
        $this->external_parsers[$extension] = $parser;
        $parser->pObj = $this;
        // Init parser and if it returns FALSE, unset its entry again:
        $initialized = $parser->initParser($extension);
        if (!$initialized) {
            unset($this->external_parsers[$extension]);
        }
    }
}
299 
300  /********************************
301  *
302  * Indexing; TYPO3 pages (HTML content)
303  *
304  *******************************/
/**
 * Indexes the HTML content of the current TYPO3 page ($this->conf['content'])
 * when the mtime check, the group list check or $this->forceIndexing asks
 * for it. If the content hash is already known only timestamp, set id,
 * group list and rootline are updated.
 */
public function indexTypo3PageContent()
{
    $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
    $is_grlist = $this->is_grlist_set($this->hash['phash']);
    if ($check <= 0 && $is_grlist && !$this->forceIndexing) {
        $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        return;
    }

    // Setting message:
    if ($this->forceIndexing) {
        $this->log_setTSlogMessage('Indexing needed, reason: Forced', LogLevel::NOTICE);
    } elseif ($check > 0) {
        $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], LogLevel::NOTICE);
    } else {
        $this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!', LogLevel::NOTICE);
    }

    // Divide into title,keywords,description and body:
    $this->log_push('Split content', '');
    $this->contentParts = $this->splitHTMLContent($this->conf['content']);
    if ($this->conf['indexedDocTitle']) {
        $this->contentParts['title'] = $this->conf['indexedDocTitle'];
    }
    $this->log_pull();

    // The hash covers title, description and keywords as well, so that a
    // changed page title alone is detected as a content change.
    $this->content_md5h = IndexedSearchUtility::md5inthash(implode('', $this->contentParts));

    // If a page with the very same content hash is already indexed, nothing
    // has to be indexed again regardless of mtime; this also keeps logged in
    // frontend users from re-indexing unchanged pages.
    $checkCHash = $this->checkContentHash();
    if (is_array($checkCHash) && $check !== 1) {
        // Update the timestamp
        $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
        $this->updateSetId($this->hash['phash']);
        // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
        $this->update_grlist($checkCHash['phash'], $this->hash['phash']);
        $this->updateRootline();
        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
        return;
    }

    // Start of the parse time measurement that updateParsetime() receives below.
    $Pstart = IndexedSearchUtility::milliseconds();
    $this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
    $this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
    $this->log_pull();
    // Splitting words
    $this->log_push('Extract words from content', '');
    $splitInWords = $this->processWordsInArrays($this->contentParts);
    $this->log_pull();
    // Analyze the indexed words.
    $this->log_push('Analyze the extracted words', '');
    $indexArr = $this->indexAnalyze($splitInWords);
    $this->log_pull();
    // Submitting page (phash) record
    $this->log_push('Submitting page', '');
    $this->submitPage();
    $this->log_pull();
    // Check words and submit to word list if not there
    $this->log_push('Check word list and submit words', '');
    if (IndexedSearchUtility::isTableUsed('index_words')) {
        $indexArr = $this->removePhashCollisions($indexArr);
        $this->checkWordList($indexArr);
        $this->submitWords($indexArr, $this->hash['phash']);
    }
    $this->log_pull();
    // Set parsetime
    $this->updateParsetime($this->hash['phash'], IndexedSearchUtility::milliseconds() - $Pstart);
    // Checking external files if configured for.
    $this->log_push('Checking external files', '');
    if ($this->conf['index_externals']) {
        $this->extractLinks($this->conf['content']);
    }
    $this->log_pull();
}
380 
/**
 * Splits an HTML document into the parts title, keywords, description and
 * body. The body is reduced to plain text.
 *
 * @param string $content HTML document
 * @return array Content array with the keys of $this->defaultContentArray
 */
public function splitHTMLContent($content)
{
    // divide head from body
    $contentArr = $this->defaultContentArray;
    $contentArr['body'] = stristr($content, '<body') ?: '';
    // Without a <body tag the whole document is the head. A negative length
    // of zero must be avoided: substr($content, 0, -0) returns ''.
    $bodyLength = strlen($contentArr['body']);
    $headPart = $bodyLength > 0 ? substr($content, 0, -$bodyLength) : $content;
    // get title
    $this->embracingTags($headPart, 'TITLE', $contentArr['title'], $dummy2, $dummy);
    // Only the part behind the first colon is kept, if there is one.
    $titleParts = explode(':', $contentArr['title'], 2);
    $contentArr['title'] = trim($titleParts[1] ?? $titleParts[0]);
    // get keywords and description metatags
    if ($this->conf['index_metatags'] ?? false) {
        $meta = [];
        $i = 0;
        while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $meta[$i])) {
            $i++;
        }
        // @todo The code below stops at first unset tag. Is that correct?
        for ($i = 0; isset($meta[$i]); $i++) {
            // decode HTML entities, meta tag content needs to be encoded later
            $meta[$i] = GeneralUtility::get_tag_attributes($meta[$i], true);
            $metaName = $meta[$i]['name'] ?? '';
            // A meta tag without content attribute contributes an empty value.
            $metaContent = $meta[$i]['content'] ?? '';
            if (stripos($metaName, 'keywords') !== false) {
                $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaContent);
            }
            if (stripos($metaName, 'description') !== false) {
                $contentArr['description'] .= ',' . $metaContent;
            }
        }
    }
    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($contentArr['body']);
    // Get rid of unwanted sections (ie. scripting and style stuff) in body
    $tagList = explode(',', $this->excludeSections);
    foreach ($tagList as $tag) {
        while ($this->embracingTags($contentArr['body'], $tag, $dummy, $contentArr['body'], $dummy2)) {
        }
    }
    // remove tags, but first make sure we don't concatenate words by doing it
    $contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
    $contentArr['body'] = trim(strip_tags($contentArr['body']));
    $contentArr['keywords'] = trim($contentArr['keywords']);
    $contentArr['description'] = trim($contentArr['description']);
    // Return array
    return $contentArr;
}
433 
/**
 * Reads the charset from the http-equiv "Content-Type" meta tag of an HTML
 * document.
 *
 * @param string $content HTML document
 * @return string The charset as written in the document, or an empty string
 *                when no such meta tag or no charset is found
 */
public function getHTMLcharset($content)
{
    $metaTagPattern = '/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i';
    $charsetPattern = '/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i';

    if (!preg_match($metaTagPattern, $content, $metaTag)) {
        return '';
    }
    if (!preg_match($charsetPattern, $metaTag[0], $charsetMatch)) {
        return '';
    }
    return $charsetMatch[1];
}
450 
/**
 * Converts an HTML document to UTF-8 and decodes its HTML entities.
 *
 * @param string $content HTML document
 * @param string $charset Source charset; when empty it is read from the
 *                        document with getHTMLcharset()
 * @return string Converted document
 */
public function convertHTMLToUtf8($content, $charset = '')
{
    // Find charset:
    if (!$charset) {
        $charset = $this->getHTMLcharset($content);
    }
    $normalizedCharset = trim(strtolower($charset));

    // Convert charset:
    $conversionNeeded = $normalizedCharset && $normalizedCharset !== 'utf-8';
    if ($conversionNeeded) {
        $content = mb_convert_encoding($content, 'utf-8', $normalizedCharset);
    }

    // Convert entities, assuming document is now UTF-8:
    return html_entity_decode($content);
}
470 
/**
 * Finds the first occurrence of a tag (case-insensitive) and returns its
 * parts through the reference parameters.
 *
 * @param string $string Text to search in
 * @param string $tagName Tag name without brackets, e.g. "title"
 * @param string $tagContent Receives the text between start and end tag, or
 *                           '' when no end tag follows
 * @param string $stringAfter Receives the text with the tag section removed;
 *                            without an end tag, the text after the start tag
 * @param string $paramList Receives the raw attribute part of the start tag
 * @return bool FALSE when the tag is not found (the reference parameters are
 *              then left untouched), TRUE otherwise
 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
{
    $endTag = '</' . $tagName . '>';
    $startTag = '<' . $tagName;
    // stristr used because we want a case-insensitive search for the tag.
    $isTagInText = stristr($string, $startTag);
    // if the tag was not found, return FALSE
    if (!$isTagInText) {
        return false;
    }
    // A start tag that is never closed with ">" yields only one element, so
    // pad the result to keep the remaining text a string.
    $tagAndRest = explode('>', substr($isTagInText, strlen($startTag)), 2);
    [$paramList, $isTagInText] = array_pad($tagAndRest, 2, '');
    $afterTagInText = stristr($isTagInText, $endTag);
    if ($afterTagInText) {
        $stringBefore = substr($string, 0, (int)stripos($string, $startTag));
        $tagContent = substr($isTagInText, 0, strlen($isTagInText) - strlen($afterTagInText));
        $stringAfter = $stringBefore . substr($afterTagInText, strlen($endTag));
    } else {
        $tagContent = '';
        $stringAfter = $isTagInText;
    }
    return true;
}
505 
/**
 * Reduces the body to the sections selected with the
 * <!--TYPO3SEARCH_begin--> and <!--TYPO3SEARCH_end--> markers.
 *
 * Text after a "begin" marker is kept. An "end" marker keeps the text in
 * front of it when no "begin" marker preceded that text.
 *
 * @param string $body HTML body, modified in place when markers are found
 * @return bool TRUE when at least one marker was found, FALSE otherwise
 */
public function typoSearchTags(&$body)
{
    $segments = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body) ?: [];
    if (count($segments) <= 1) {
        return false;
    }

    $collected = '';
    $pending = '';
    foreach ($segments as $segment) {
        $pieces = explode('-->', $segment, 2);
        $marker = trim($pieces[0]);
        if ($marker === 'begin') {
            $collected .= $pieces[1];
            $pending = '';
            continue;
        }
        if ($marker === 'end') {
            $collected .= $pending;
            continue;
        }
        $pending = $segment;
    }
    $body = $collected;

    return true;
}
534 
/**
 * Finds the hyperlinks of an HTML document and indexes their targets:
 * external URLs (when enabled in the extension configuration) and local
 * files.
 *
 * @param string $content HTML document
 */
public function extractLinks($content)
{
    // Get links:
    $list = $this->extractHyperLinks($content);
    // Traverse links:
    foreach ($list as $linkInfo) {
        // Decode entities:
        // localPath means: This file is sent by a download script. While the indexed URL has to point to $linkInfo['href'], the absolute path to the file is specified here!
        $linkSource = htmlspecialchars_decode($linkInfo['localPath'] ?: $linkInfo['href']);
        // Parse URL:
        $qParts = parse_url($linkSource);
        // Check for jumpurl (TYPO3 specific thing...)
        if (str_contains((string)($qParts['query'] ?? ''), 'jumpurl=')) {
            parse_str($qParts['query'], $getP);
            // The substring test also matches parameters like "xjumpurl=",
            // so only follow a real, scalar jumpurl parameter.
            if (isset($getP['jumpurl']) && is_string($getP['jumpurl'])) {
                $linkSource = $getP['jumpurl'];
                $qParts = parse_url($linkSource);
            }
        }
        if (!$linkInfo['localPath'] && ($qParts['scheme'] ?? false)) {
            if ($this->indexerConfig['indexExternalURLs'] ?? false) {
                // Index external URL (http or otherwise)
                $this->indexExternalUrl($linkSource);
            }
        } elseif (!($qParts['query'] ?? false)) {
            $linkSource = urldecode($linkSource);
            if (GeneralUtility::isAllowedAbsPath($linkSource)) {
                $localFile = $linkSource;
            } else {
                $localFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $linkSource);
            }
            if ($localFile && @is_file($localFile)) {
                // Index local file:
                if ($linkInfo['localPath']) {
                    $fI = pathinfo($linkSource);
                    // Files without a suffix have no "extension" key.
                    $ext = strtolower($fI['extension'] ?? '');
                    $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
                } else {
                    $this->indexRegularDocument($linkSource);
                }
            }
        }
    }
}
586 
/**
 * Collects all <a> tags of an HTML document that have a non-empty href
 * which does not start with "#".
 *
 * @param string $html HTML document
 * @return array List of arrays with the keys "tag", "href" and "localPath"
 */
public function extractHyperLinks($html)
{
    $htmlParser = GeneralUtility::makeInstance(HtmlParser::class);
    $hyperLinksData = [];
    foreach ($htmlParser->splitTags('a', $html) as $index => $tagData) {
        // Only the odd entries of the split result are inspected.
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'a') {
            continue;
        }
        if (empty($tagAttributes[0]['href'])) {
            continue;
        }
        $href = $tagAttributes[0]['href'];
        if (substr($href, 0, 1) === '#') {
            continue;
        }
        $hyperLinksData[] = [
            'tag' => $tagData,
            'href' => $href,
            'localPath' => $this->createLocalPath(urldecode($href)),
        ];
    }
    return $hyperLinksData;
}
616 
/**
 * Returns the href of the first <base> tag of an HTML document that has a
 * non-empty href attribute.
 *
 * @param string $html HTML document
 * @return string The href, or an empty string when there is none
 */
public function extractBaseHref($html)
{
    $href = '';
    $htmlParser = GeneralUtility::makeInstance(HtmlParser::class);
    $htmlParts = $htmlParser->splitTags('base', $html);
    foreach ($htmlParts as $index => $tagData) {
        if ($index % 2 !== 0) {
            $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
            $firstTagName = $htmlParser->getFirstTagName($tagData);
            if (strtolower($firstTagName) === 'base') {
                // <base target="..."> carries no href attribute at all.
                $href = $tagAttributes[0]['href'] ?? '';
                if ($href) {
                    break;
                }
            }
        }
    }
    return $href;
}
642 
643  /******************************************
644  *
645  * Indexing; external URL
646  *
647  ******************************************/
/**
 * Fetches an external URL and indexes it as an HTML document, provided that
 * its response headers announce the content type "text/html".
 *
 * @param string $externalUrl URL to index
 */
public function indexExternalUrl($externalUrl)
{
    // Get headers:
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    if (!is_array($urlHeaders)) {
        return;
    }
    // Header names are case-insensitive and the header may be missing.
    $contentType = '';
    foreach ($urlHeaders as $headerName => $headerValue) {
        if (strcasecmp((string)$headerName, 'Content-Type') === 0) {
            $contentType = (string)$headerValue;
            break;
        }
    }
    if (stripos($contentType, 'text/html') === false) {
        return;
    }
    $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
    if ((string)$content === '') {
        return;
    }
    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    try {
        GeneralUtility::writeFile($tmpFile, $content);
        // Index that file:
        // Using "TRUE" for second parameter to force indexing of external URLs (mtime doesn't make sense, does it?)
        $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
    } finally {
        // Remove the temporary file even when indexing throws.
        if (is_file($tmpFile)) {
            unlink($tmpFile);
        }
    }
}
671 
/**
 * Requests the headers of a URL with a HEAD request.
 *
 * @param string $url URL to request
 * @return array|false Header values keyed by header name, each one the
 *                     concatenation of all values of that header, or FALSE
 *                     when the request failed
 */
public function getUrlHeaders($url)
{
    try {
        $requestFactory = GeneralUtility::makeInstance(RequestFactory::class);
        $headers = $requestFactory->request($url, 'HEAD')->getHeaders();
        $joinValues = static function ($values) {
            return implode('', $values);
        };
        return array_map($joinValues, $headers);
    } catch (\Exception $e) {
        // fail silently if the HTTP request failed
        return false;
    }
}
693 
/**
 * Tries the available strategies one after another to turn a link target
 * into an absolute local file path; the first non-empty result wins.
 *
 * @param string $sourcePath Link target
 * @return string Absolute local path, or the (empty) result of the last
 *                strategy when none succeeded
 */
protected function createLocalPath($sourcePath)
{
    $strategies = [
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
        'createLocalPathFromRelativeURL',
    ];
    $candidate = null;
    foreach ($strategies as $strategy) {
        $candidate = $this->{$strategy}($sourcePath);
        if ($candidate != '') {
            return $candidate;
        }
    }
    return $candidate;
}
716 
/**
 * Resolves a link target that starts with the site URL of the current
 * request to a file below the public path.
 *
 * @param string $sourcePath Link target
 * @return string Absolute local path, or an empty string when the target
 *                does not start with the site URL or is not an allowed file
 */
protected function createLocalPathUsingDomainURL($sourcePath)
{
    $siteUrl = $GLOBALS['TYPO3_REQUEST']->getAttribute('normalizedParams')->getSiteUrl();
    $siteUrlLength = strlen($siteUrl);
    if (strpos($sourcePath, $siteUrl) !== 0) {
        return '';
    }
    $relativePath = substr($sourcePath, $siteUrlLength);
    $candidate = Environment::getPublicPath() . '/' . $relativePath;

    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
737 
/**
 * Resolves a link target that starts with the configured
 * config.absRefPrefix to a file below the public path.
 *
 * @param string $sourcePath Link target
 * @return string Absolute local path, or an empty string when no frontend
 *                controller is available, no prefix is configured, the
 *                target does not start with it or is not an allowed file
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath)
{
    $localPath = '';
    if (isset($GLOBALS['TSFE']) && $GLOBALS['TSFE'] instanceof TypoScriptFrontendController) {
        // The setting is optional; treat a missing value as "no prefix".
        $absRefPrefix = (string)($GLOBALS['TSFE']->config['config']['absRefPrefix'] ?? '');
        $absRefPrefixLength = strlen($absRefPrefix);
        if ($absRefPrefixLength > 0 && strpos($sourcePath, $absRefPrefix) === 0) {
            $sourcePath = substr($sourcePath, $absRefPrefixLength);
            $localPath = Environment::getPublicPath() . '/' . $sourcePath;
            if (!self::isAllowedLocalFile($localPath)) {
                $localPath = '';
            }
        }
    }
    return $localPath;
}
761 
/**
 * Resolves a link target that starts with "/" to a file below the public
 * path.
 *
 * @param string $sourcePath Link target
 * @return string Absolute local path, or an empty string when the target
 *                does not start with "/" or is not an allowed file
 */
protected function createLocalPathFromAbsoluteURL($sourcePath)
{
    $firstCharacter = $sourcePath[0] ?? '';
    if ($firstCharacter !== '/') {
        return '';
    }
    $candidate = Environment::getPublicPath() . '/' . substr($sourcePath, 1);

    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
781 
/**
 * Resolves a relative link target to a file below the public path.
 *
 * @param string $sourcePath Link target
 * @return string Absolute local path, or an empty string when the target is
 *                not a relative URL or is not an allowed file
 */
protected function createLocalPathFromRelativeURL($sourcePath)
{
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }
    $candidate = Environment::getPublicPath() . '/' . $sourcePath;

    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
799 
/**
 * Tells whether a URL is relative: it has no scheme and its path does not
 * start with "/".
 *
 * @param string $url URL to check
 * @return bool TRUE for a relative URL
 */
protected static function isRelativeURL($url)
{
    $urlParts = @parse_url($url);
    $scheme = $urlParts['scheme'] ?? '';
    $firstPathCharacter = $urlParts['path'][0] ?? '';

    return $scheme === '' && $firstPathCharacter !== '/';
}
811 
/**
 * Tells whether a path points to an existing file inside the public path.
 *
 * @param string $filePath Absolute path, may contain "../" segments
 * @return bool TRUE when the resolved path is a file below the public path
 */
protected static function isAllowedLocalFile($filePath)
{
    $filePath = GeneralUtility::resolveBackPath($filePath);
    // Compare against the directory including its trailing slash, otherwise
    // a sibling like "<public>-backup/file" would pass the prefix test.
    $publicDirectory = rtrim(Environment::getPublicPath(), '/') . '/';
    $insideWebPath = strpos($filePath, $publicDirectory) === 0;

    return $insideWebPath && is_file($filePath);
}
825 
826  /******************************************
827  *
828  * Indexing; external files (PDF, DOC, etc)
829  *
830  ******************************************/
/**
 * Indexes a regular document (external file) with the external parser
 * registered for its extension. Every content part reported by the parser
 * is indexed separately.
 *
 * @param string $file Path of the file; relative paths are resolved against
 *                     the public path
 * @param bool $force When TRUE the file is indexed regardless of the mtime
 *                    check, the file limit and the content hash
 * @param string $contentTmpFile Path of a temporary file to read the
 *                               content from instead of $file
 * @param string $altExtension Extension to use instead of the one of $file
 */
public function indexRegularDocument($file, $force = false, $contentTmpFile = '', $altExtension = '')
{
    // Init
    $fI = pathinfo($file);
    // Files without a suffix have no "extension" key.
    $ext = $altExtension ?: strtolower($fI['extension'] ?? '');
    // Create abs-path:
    if (!$contentTmpFile) {
        if (!PathUtility::isAbsolutePath($file)) {
            // Relative, prepend public web path:
            $absFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $file);
        } else {
            // Absolute, pass-through:
            $absFile = $file;
        }
        $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
    } else {
        $absFile = $contentTmpFile;
    }
    if (!$absFile || !@is_file($absFile)) {
        $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
        return;
    }
    if (!($this->external_parsers[$ext] ?? false)) {
        $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
        return;
    }

    // Indexing the document:
    $fileInfo = stat($absFile);
    $cParts = $this->fileContentParts($ext, $absFile);
    foreach ($cParts as $cPKey) {
        $this->internal_log = [];
        $this->log_push('Index: ' . str_replace('.', '_', PathUtility::basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
        // Start of the parse time measurement that updateParsetime() receives below.
        $Pstart = IndexedSearchUtility::milliseconds();
        $subinfo = ['key' => $cPKey];
        // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
        $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
        $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
        if ($check > 0 || $force) {
            if ($check > 0) {
                $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], LogLevel::NOTICE);
            } else {
                $this->log_setTSlogMessage('Indexing forced by flag', LogLevel::NOTICE);
            }
            // Check external file counter:
            if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                // Divide into title,keywords,description and body:
                $this->log_push('Split content', '');
                $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                $this->log_pull();
                if (is_array($contentParts)) {
                    // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                    $content_md5h = IndexedSearchUtility::md5inthash(implode('', $contentParts));
                    if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                        // Increment counter:
                        $this->externalFileCounter++;
                        // Splitting words
                        $this->log_push('Extract words from content', '');
                        $splitInWords = $this->processWordsInArrays($contentParts);
                        $this->log_pull();
                        // Analyze the indexed words.
                        $this->log_push('Analyze the extracted words', '');
                        $indexArr = $this->indexAnalyze($splitInWords);
                        $this->log_pull();
                        // Submitting page (phash) record
                        $this->log_push('Submitting page', '');
                        // The creation time of a file cannot be determined, so stat's ctime is passed along with mtime.
                        $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
                        $this->log_pull();
                        // Check words and submit to word list if not there
                        $this->log_push('Check word list and submit words', '');
                        if (IndexedSearchUtility::isTableUsed('index_words')) {
                            $indexArr = $this->removePhashCollisions($indexArr);
                            $this->checkWordList($indexArr);
                            $this->submitWords($indexArr, $phash_arr['phash']);
                        }
                        $this->log_pull();
                        // Set parsetime
                        $this->updateParsetime($phash_arr['phash'], IndexedSearchUtility::milliseconds() - $Pstart);
                    } else {
                        // Update the timestamp
                        $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
                        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                    }
                } else {
                    $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                }
            } else {
                $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
            }
        } else {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        }
        // Checking and setting sections:
        // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
        $this->submitFile_section($phash_arr['phash']);
        $this->log_pull();
    }
}
937 
/**
 * Reads the content of an external file through the parser registered for
 * its extension.
 *
 * @param string $fileExtension File extension, key into $this->external_parsers
 * @param string $absoluteFileName Absolute path of the file
 * @param string|int $sectionPointer Content part to read, as reported by
 *                                   fileContentParts()
 * @return array|null Whatever the parser returns, or NULL when no parser
 *                    object is registered for the extension
 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
{
    // Consult relevant external document parser; an unknown extension
    // simply yields NULL instead of an undefined index warning.
    $parser = $this->external_parsers[$fileExtension] ?? null;
    if (!is_object($parser)) {
        return null;
    }
    return $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
956 
/**
 * Asks the parser registered for a file extension into which content parts
 * a file is divided.
 *
 * @param string $ext File extension, key into $this->external_parsers
 * @param string $absFile Absolute path of the file
 * @return array Content part keys; [0] when no parser object is registered
 *               for the extension
 */
public function fileContentParts($ext, $absFile)
{
    // Consult relevant external document parser; an unknown extension
    // falls back to the single default part without a warning.
    $parser = $this->external_parsers[$ext] ?? null;
    if (!is_object($parser)) {
        return [0];
    }
    return $parser->fileContentParts($ext, $absFile);
}
973 
/**
 * Wraps plain content into a content array: the given text becomes the
 * body, all other parts keep the values of $this->defaultContentArray.
 *
 * @param string $content Plain content
 * @return array Content array
 */
public function splitRegularContent($content)
{
    return array_replace($this->defaultContentArray, ['body' => $content]);
}
987 
988  /**********************************
989  *
990  * Analysing content, Extracting words
991  *
992  **********************************/
/**
 * Converts all non-empty parts of a content array to UTF-8 and decodes
 * their HTML entities.
 *
 * @param array $contentArr Content parts, modified in place
 * @param string $charset Charset of the content; an empty value or "utf-8"
 *                        (in any case) skips the charset conversion
 */
public function charsetEntity2utf8(&$contentArr, $charset)
{
    // Normalize once, in the same way as convertHTMLToUtf8() does; an empty
    // charset must never reach mb_convert_encoding().
    $charset = strtolower(trim((string)$charset));
    $mustConvert = $charset !== '' && $charset !== 'utf-8';
    foreach ($contentArr as $key => $value) {
        if ((string)$value === '') {
            continue;
        }
        if ($mustConvert) {
            $value = mb_convert_encoding($value, 'utf-8', $charset);
        }
        // decode all numeric / html-entities in the string to real characters:
        $contentArr[$key] = html_entity_decode($value);
    }
}
1012 
/**
 * Splits every part of a content array into words with the lexer and
 * removes duplicate words from title, keywords and description.
 *
 * @param array $contentArr Content parts as text
 * @return array Content parts as word lists
 */
public function processWordsInArrays($contentArr)
{
    // split all parts to words
    foreach ($contentArr as $part => $text) {
        $contentArr[$part] = $this->lexerObj->split2Words($text);
    }
    // For title, keywords, and description we don't want duplicates:
    foreach (['title', 'keywords', 'description'] as $part) {
        $contentArr[$part] = array_unique($contentArr[$part]);
    }
    return $contentArr;
}
1032 
/**
 * Builds a description from the body text: whitespace runs are collapsed
 * to single spaces and the result is cut to the configured length
 * ($this->conf['index_descrLgd'], 0-255, default 200).
 *
 * @param array $contentArr Content parts; the key "body" is read
 * @return string Description, empty when the configured length is 0
 */
public function bodyDescription($contentArr)
{
    // Setting description
    $maxLength = MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
    if (!$maxLength) {
        return '';
    }
    $collapsedBody = preg_replace('/\s+/u', ' ', $contentArr['body']);
    // Shorten the string. If the database has the wrong character set
    // the string is probably truncated again. mb_strcut can not be
    // used here because it's not part of the fallback package
    // symfony/polyfill-mbstring in case of the missing ext:mbstring.
    return \mb_substr($collapsedBody, 0, $maxLength, 'utf-8');
}
1054 
/**
 * Analyzes the word lists of all content parts and merges them into one
 * word index.
 *
 * @param array $content Word lists keyed by content part
 * @return array Word index keyed by word
 */
public function indexAnalyze($content)
{
    $indexArr = [];
    // Bit offset used for the "cmp" flag of each header part.
    $headerOffsets = [
        'title' => 7,
        'keywords' => 6,
        'description' => 5,
    ];
    foreach ($headerOffsets as $part => $offset) {
        $this->analyzeHeaderinfo($indexArr, $content, $part, $offset);
    }
    $this->analyzeBody($indexArr, $content);

    return $indexArr;
}
1070 
/**
 * Adds the words of one header part (title, keywords or description) to
 * the word index.
 *
 * @param array $retArr Word index keyed by word, modified in place
 * @param array $content Word lists keyed by content part
 * @param string $key Content part to analyze
 * @param int $offset Bit position that is set in the "cmp" flag of each word
 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset)
{
    $priorityBit = 2 ** $offset;
    foreach ($content[$key] as $word) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $word = mb_substr($word, 0, 60);
        if (!isset($retArr[$word])) {
            // Word ID (wid)
            $retArr[$word]['hash'] = IndexedSearchUtility::md5inthash($word);
            // Metaphone value is also 60 only chars long
            $retArr[$word]['metaphone'] = $this->enableMetaphoneSearch
                ? substr($this->metaphone($word, $this->storeMetaphoneInfoAsWords), 0, 60)
                : '';
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$word]['metaphone'];
        }
        // Priority used for flagBitMask feature (see extension configuration)
        $retArr[$word]['cmp'] = ($retArr[$word]['cmp'] ?? 0) | $priorityBit;

        // Increase number of occurrences
        $occurrences = ($retArr[$word]['count'] ?? false) ?: 0;
        $retArr[$word]['count'] = $occurrences + 1;
        $this->wordcount++;
    }
}
1106 
/**
 * Adds the words of the body to the word index.
 *
 * @param array $retArr Word index keyed by word, modified in place
 * @param array $content Word lists keyed by content part; "body" is read
 */
public function analyzeBody(&$retArr, $content)
{
    foreach ($content['body'] as $key => $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        // Character based like in analyzeHeaderinfo(): a byte based cut could
        // split a multibyte character and produce a different word for the
        // same text.
        $val = mb_substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // First occurrence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        if (!($retArr[$val]['count'] ?? false)) {
            $retArr[$val]['count'] = 0;
        }

        // Increase number of occurrences
        $retArr[$val]['count']++;
        $this->wordcount++;
    }
}
1140 
/**
 * Creates the metaphone value for a word.
 *
 * Uses the configured metaphone object when one is set, otherwise PHP's native
 * metaphone() function.
 *
 * @param string $word Word to convert
 * @param bool $returnRawMetaphoneValue If set, the raw metaphone value is returned instead of its hash
 * @return mixed Raw metaphone value, or the integer hash of it (0 if the raw value is an empty string)
 */
public function metaphone($word, $returnRawMetaphoneValue = false)
{
    // Prefer the advanced metaphone object; fall back to the native PHP function otherwise
    $rawValue = is_object($this->metaphoneObj)
        ? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
        : metaphone($word);

    if ($returnRawMetaphoneValue) {
        return $rawValue;
    }
    // Create hash and return integer; an empty metaphone value maps to 0
    return $rawValue !== '' ? IndexedSearchUtility::md5inthash($rawValue) : 0;
}
1166 
1167  /********************************
1168  *
1169  * SQL; TYPO3 Pages
1170  *
1171  *******************************/
/**
 * Writes the index records of the current TYPO3 page.
 *
 * Removes existing records for the page hash, then inserts into index_phash,
 * index_section, index_grlist, index_fulltext and (in debug mode) index_debug.
 * Each insert is skipped when the respective table is not in use.
 */
public function submitPage()
{
    $phash = $this->hash['phash'];

    // Remove any current data for this phash:
    $this->removeOldIndexedPages($phash);

    // Setting new phash_row
    $staticPageArguments = is_array($this->conf['staticPageArguments'])
        ? json_encode($this->conf['staticPageArguments'])
        : null;
    $pageRecord = [
        'phash' => $phash,
        'phash_grouping' => $this->hash['phash_grouping'],
        'static_page_arguments' => $staticPageArguments,
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // Item type 0: TYPO3 page
        'item_type' => 0,
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => (int)$this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        // Creation date of page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash')
            ->insert('index_phash', $pageRecord);
    }

    // PROCESSING index_section
    $this->submit_section($phash, $phash);
    // PROCESSING index_grlist
    $this->submit_grlist($phash, $phash);

    // PROCESSING index_fulltext
    $fulltextData = implode(' ', $this->contentParts);
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        $fulltextData = substr($fulltextData, 0, $this->indexerConfig['fullTextDataLength']);
    }
    $fulltextRecord = [
        'phash' => $phash,
        'fulltextdata' => $fulltextData,
        'metaphonedata' => $this->metaphoneContent,
    ];
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext')
            ->insert('index_fulltext', $fulltextRecord);
    }

    // PROCESSING index_debug
    if (!($this->indexerConfig['debugMode'] ?? false)) {
        return;
    }
    // Content and body are shortened to 1000 bytes for the debug record
    $debugRecord = [
        'phash' => $phash,
        'debuginfo' => json_encode([
            'external_parsers initialized' => array_keys($this->external_parsers),
            'conf' => array_merge($this->conf, ['content' => substr($this->conf['content'], 0, 1000)]),
            'contentParts' => array_merge($this->contentParts, ['body' => substr($this->contentParts['body'], 0, 1000)]),
            'logs' => $this->internal_log,
            'lexer' => $this->lexerObj->debugString,
        ]),
    ];
    if (IndexedSearchUtility::isTableUsed('index_debug')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_debug')
            ->insert('index_debug', $debugRecord);
    }
}
1251 
/**
 * Inserts a gr_list record for a page hash into index_grlist.
 *
 * The group list is taken from $this->conf['gr_list']. Nothing is written when
 * the index_grlist table is not in use.
 *
 * @param int $hash Value stored in the "phash" column
 * @param int $phash_x Value stored in the "phash_x" column
 */
public function submit_grlist($hash, $phash_x)
{
    $grList = $this->conf['gr_list'];
    $record = [
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($grList),
        'gr_list' => $grList,
    ];
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->insert('index_grlist', $record);
}
/**
 * Inserts a section record for a page hash into index_section.
 *
 * Besides the hashes and the page id, the rootline columns (rl0, rl1, rl2 and any
 * additionally registered rootline fields) are filled via getRootLineFields(), so a
 * freshly inserted row carries the same rootline data that updateRootline() maintains.
 * Nothing is written when the index_section table is not in use.
 *
 * @param int $hash Value stored in the "phash" column
 * @param int $hash_t3 Value stored in the "phash_t3" column
 */
public function submit_section($hash, $hash_t3)
{
    $fields = [
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id'],
    ];
    // Add the rootline columns; without them the new row would only get column defaults
    $this->getRootLineFields($fields);
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_section');
        $connection->insert('index_section', $fields);
    }
}
1296 
/**
 * Removes all index records that belong to a TYPO3 page hash.
 *
 * Deletes by "phash" from every page related index table that is in use and
 * additionally removes index_section rows whose "phash_t3" equals the hash.
 *
 * @param int $phash Page hash whose records are removed
 */
public function removeOldIndexedPages($phash)
{
    // Removing old registrations for all tables. Because the pages are TYPO3 pages
    // there can be nothing else than 1-1 relations here.
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    $tableArray = ['index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug'];
    foreach ($tableArray as $table) {
        // Only touch tables that are actually in use, as every other method of this class does
        if (IndexedSearchUtility::isTableUsed($table)) {
            $connectionPool->getConnectionForTable($table)->delete($table, ['phash' => (int)$phash]);
        }
    }

    // Removing all index_section records with hash_t3 set to this hash (this includes such
    // records set for external media on the page as well!). The re-insert of these records
    // are done in indexRegularDocument($file).
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        $connectionPool->getConnectionForTable('index_section')
            ->delete('index_section', ['phash_t3' => (int)$phash]);
    }
}
1322 
1323  /********************************
1324  *
1325  * SQL; External media
1326  *
1327  *******************************/
/**
 * Writes the index records of an external file (or external URL).
 *
 * Removes existing records for the file hash, then inserts into index_phash,
 * index_fulltext and (in debug mode) index_debug. Each insert is skipped when
 * the respective table is not in use.
 *
 * @param array $hash Hash array; the keys "phash" and "phash_grouping" are read
 * @param string $file File name or URL
 * @param array $subinfo Sub information, stored JSON encoded in "static_page_arguments"
 * @param string $ext File extension, used to look up the item type of the external parser
 * @param int $mtime Modification time stored as "item_mtime"
 * @param int $ctime Creation time stored as "item_crdate"
 * @param int $size Size stored as "item_size"
 * @param int $content_md5h Content hash
 * @param array $contentParts Content parts; "title" and "body" are read
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
{
    $phash = $hash['phash'];

    // Find item type: the parser's mapping wins, the plain extension is the fallback
    $storeItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext] ?: $ext;

    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($phash);

    // Split filename: a scheme part marks the file as external URL
    $fileParts = parse_url($file);
    $isExternalUrl = ($fileParts['scheme'] ?? false) ? 1 : 0;

    // Setting new
    $fileRecord = [
        'phash' => $phash,
        'phash_grouping' => $hash['phash_grouping'],
        'static_page_arguments' => json_encode($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $storeItemType,
        'item_title' => trim($contentParts['title']) ?: PathUtility::basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        'externalUrl' => $isExternalUrl,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid'],
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash')
            ->insert('index_phash', $fileRecord);
    }

    // PROCESSING index_fulltext
    $fulltextData = implode(' ', $contentParts);
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        $fulltextData = substr($fulltextData, 0, $this->indexerConfig['fullTextDataLength']);
    }
    $fulltextRecord = [
        'phash' => $phash,
        'fulltextdata' => $fulltextData,
        'metaphonedata' => $this->metaphoneContent,
    ];
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext')
            ->insert('index_fulltext', $fulltextRecord);
    }

    // PROCESSING index_debug
    if (!($this->indexerConfig['debugMode'] ?? false)) {
        return;
    }
    // The body is shortened to 1000 bytes for the debug record
    $debugRecord = [
        'phash' => $phash,
        'debuginfo' => json_encode([
            'static_page_arguments' => $subinfo,
            'contentParts' => array_merge($contentParts, ['body' => substr($contentParts['body'], 0, 1000)]),
            'logs' => $this->internal_log,
            'lexer' => $this->lexerObj->debugString,
        ]),
    ];
    if (IndexedSearchUtility::isTableUsed('index_debug')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_debug')
            ->insert('index_debug', $debugRecord);
    }
}
1412 
/**
 * Stores a gr_list record for an indexed file unless a suitable one exists.
 *
 * A record counts as existing when index_grlist holds a row for the hash whose
 * "hash_gr_list" matches either the default group list (non-logged in user) or
 * the current group list from $this->conf['gr_list'].
 *
 * @param int $hash Hash of the indexed file, used as "phash" and "phash_x"
 */
public function submitFile_grlist($hash)
{
    // Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_grlist');
    // All three values are integer hashes, so they are bound as integers
    $count = (int)$queryBuilder->count('*')
        ->from('index_grlist')
        ->where(
            $queryBuilder->expr()->eq(
                'phash',
                $queryBuilder->createNamedParameter($hash, Connection::PARAM_INT)
            ),
            $queryBuilder->expr()->orX(
                $queryBuilder->expr()->eq(
                    'hash_gr_list',
                    $queryBuilder->createNamedParameter(
                        IndexedSearchUtility::md5inthash($this->defaultGrList),
                        Connection::PARAM_INT
                    )
                ),
                $queryBuilder->expr()->eq(
                    'hash_gr_list',
                    $queryBuilder->createNamedParameter(
                        IndexedSearchUtility::md5inthash($this->conf['gr_list']),
                        Connection::PARAM_INT
                    )
                )
            )
        )
        ->executeQuery()
        ->fetchOne();

    if ($count === 0) {
        // No matching record yet: place one for the current group list
        $this->submit_grlist($hash, $hash);
    }
}
1458 
/**
 * Stores a section record for an indexed file unless one exists for the current page.
 *
 * The record is looked up by the file hash and the page id from $this->conf['id'];
 * a missing record is created with the current page hash as "phash_t3".
 *
 * @param int $hash Hash of the indexed file
 */
public function submitFile_section($hash)
{
    // Testing if there is already a section
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_section');
    $expr = $queryBuilder->expr();
    $matchesHash = $expr->eq('phash', $queryBuilder->createNamedParameter($hash, Connection::PARAM_INT));
    $matchesPage = $expr->eq('page_id', $queryBuilder->createNamedParameter($this->conf['id'], Connection::PARAM_INT));

    $existingSections = (int)$queryBuilder->count('phash')
        ->from('index_section')
        ->where($matchesHash, $matchesPage)
        ->executeQuery()
        ->fetchOne();

    if ($existingSections === 0) {
        $this->submit_section($hash, $this->hash['phash']);
    }
}
1492 
/**
 * Removes the index records that belong to a file hash.
 *
 * Deletes by "phash" from index_phash, index_grlist, index_fulltext and
 * index_debug; tables that are not in use are left alone.
 *
 * @param int $phash File hash whose records are removed
 */
public function removeOldIndexedFiles($phash)
{
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    // Removing old registrations for tables.
    foreach (['index_phash', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        if (IndexedSearchUtility::isTableUsed($tableName)) {
            $connectionPool
                ->getConnectionForTable($tableName)
                ->delete($tableName, ['phash' => (int)$phash]);
        }
    }
}
1510 
1511  /********************************
1512  *
1513  * SQL Helper functions
1514  *
1515  *******************************/
/**
 * Decides whether a document has to be (re-)indexed, based on its index_phash row.
 *
 * Return values (positive: index the document, negative: do not):
 *   4  not represented in index_phash (or the table is not in use)
 *   1  the configured max-age was exceeded
 *   2  the minimum age was exceeded and the given mtime differs from the stored one
 *   3  the minimum age was exceeded, but no mtime was given
 *  -1  mtime matched the stored one; the row's tstamp is refreshed unless a max-age is set
 *  -2  the minimum age was not exceeded
 *
 * @param int $mtime Modification time of the document
 * @param int $phash Hash of the document
 * @return int One of the result codes listed above
 */
public function checkMtimeTstamp($mtime, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        // Not indexed (not in index_phash)
        return 4;
    }

    $row = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(['item_mtime', 'tstamp'], 'index_phash', ['phash' => (int)$phash], [], [], 1)
        ->fetchAssociative();
    if (empty($row)) {
        // Page has never been indexed (is not represented in the index_phash table).
        return 4;
    }

    if ($this->tstamp_maxAge && $GLOBALS['EXEC_TIME'] > $row['tstamp'] + $this->tstamp_maxAge) {
        // The configured max-age was exceeded for the document and thus it's indexed.
        return 1;
    }

    // mtime is only considered if minAge is not set or if minAge is exceeded
    $minAgeExceeded = !$this->tstamp_minAge || $GLOBALS['EXEC_TIME'] > $row['tstamp'] + $this->tstamp_minAge;
    if (!$minAgeExceeded) {
        // The minimum age was not exceeded
        return -2;
    }
    if (!$mtime) {
        // The minimum age was exceed, but mtime was not set, so the page was indexed.
        return 3;
    }
    if ($row['item_mtime'] != $mtime) {
        // The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.
        return 2;
    }

    // mtime matched the document, so no changes detected and no content updated
    if ($this->tstamp_maxAge) {
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($row['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . ' seconds to expire time).', LogLevel::WARNING);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.', LogLevel::NOTICE);
    }
    return -1;
}
1581 
/**
 * Checks whether the current content is already indexed within the same "phash_grouping".
 *
 * With this check the page will only be indexed if its content is different from
 * the same "phash_grouping" page.
 *
 * @return mixed The matching index_phash row (array with "phash") if one exists, otherwise TRUE
 */
public function checkContentHash()
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $criteria = [
        'phash_grouping' => (int)$this->hash['phash_grouping'],
        'contentHash' => (int)$this->content_md5h,
    ];
    $matchingRow = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(['phash'], 'index_phash', $criteria, [], [], 1)
        ->fetchAssociative();

    return !empty($matchingRow) ? $matchingRow : true;
}
1612 
/**
 * Checks whether an external document with the given content is not yet indexed.
 *
 * @param int $hashGr Grouping hash ("phash_grouping") of the document
 * @param int $content_md5h Content hash of the document
 * @return bool TRUE if index_phash holds no row with this grouping and content hash
 *              (or the table is not in use), FALSE otherwise
 */
public function checkExternalDocContentHash($hashGr, $content_md5h)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $criteria = [
        'phash_grouping' => (int)$hashGr,
        'contentHash' => (int)$content_md5h,
    ];
    $matches = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->count('*', 'index_phash', $criteria);

    return $matches === 0;
}
1640 
/**
 * Tells whether index_grlist contains at least one record for the given "phash_x".
 *
 * @param int $phash_x Value to look up in the "phash_x" column
 * @return bool TRUE if a record exists, FALSE if not or if the table is not in use
 */
public function is_grlist_set($phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return false;
    }

    $records = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count('phash_x', 'index_grlist', ['phash_x' => (int)$phash_x]);

    return $records > 0;
}
1663 
/**
 * Makes sure index_grlist holds a record for the hash and the current group list.
 *
 * If no row exists for the combination of "phash" and the hash of
 * $this->conf['gr_list'], one is inserted and the insertion is logged.
 *
 * @param int $phash Value of the "phash" column
 * @param int $phash_x Value of the "phash_x" column, used when a record is inserted
 */
public function update_grlist($phash, $phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $criteria = [
        'phash' => (int)$phash,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($this->conf['gr_list']),
    ];
    $existing = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count('phash', 'index_grlist', $criteria);
    if ($existing !== 0) {
        return;
    }

    $this->submit_grlist($phash, $phash_x);
    $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', LogLevel::NOTICE);
}
1691 
/**
 * Sets the "tstamp" of an index_phash row to the current execution time.
 *
 * Nothing is written when the index_phash table is not in use.
 *
 * @param int $phash Hash of the row to update
 * @param int $mtime If set (non-zero), "item_mtime" is updated to this value as well
 */
public function updateTstamp($phash, $mtime = 0)
{
    // Same guard as in updateSetId() and updateParsetime(): skip unused tables
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }

    $updateFields = [
        'tstamp' => $GLOBALS['EXEC_TIME'],
    ];

    if ($mtime) {
        $updateFields['item_mtime'] = (int)$mtime;
    }

    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->update(
            'index_phash',
            $updateFields,
            [
                'phash' => (int)$phash,
            ]
        );
}
1722 
/**
 * Writes the current "freeIndexSetId" from $this->conf to an index_phash row.
 *
 * Nothing is written when the index_phash table is not in use.
 *
 * @param int $phash Hash of the row to update
 */
public function updateSetId($phash)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $newValues = ['freeIndexSetId' => (int)$this->conf['freeIndexSetId']];
        $identifier = ['phash' => (int)$phash];
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash')
            ->update('index_phash', $newValues, $identifier);
    }
}
1746 
/**
 * Stores the parse time in an index_phash row.
 *
 * Nothing is written when the index_phash table is not in use.
 *
 * @param int $phash Hash of the row to update
 * @param int $parsetime Parse time value, cast to integer before it is stored
 */
public function updateParsetime($phash, $parsetime)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $newValues = ['parsetime' => (int)$parsetime];
        $identifier = ['phash' => (int)$phash];
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash')
            ->update('index_phash', $newValues, $identifier);
    }
}
1771 
/**
 * Updates the rootline columns of all index_section rows of the current page.
 *
 * The column values come from getRootLineFields(); the rows are selected by
 * the page id in $this->conf['id']. Nothing is written when index_section is not in use.
 */
public function updateRootline()
{
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        $rootLineValues = [];
        $this->getRootLineFields($rootLineValues);

        $identifier = ['page_id' => (int)$this->conf['id']];
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_section')
            ->update('index_section', $rootLineValues, $identifier);
    }
}
1794 
/**
 * Adds the rootline column values of the current page to a field array.
 *
 * "rl0", "rl1" and "rl2" are taken from the first three entries of
 * $this->conf['rootline_uids']. Additional columns can be registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields']
 * as "field name => rootline level". Levels missing in the rootline yield 0.
 *
 * @param array $fieldArray Field array, passed by reference and extended in place
 */
public function getRootLineFields(array &$fieldArray)
{
    $rootLineUids = $this->conf['rootline_uids'] ?? [];
    foreach ([0, 1, 2] as $level) {
        $fieldArray['rl' . $level] = (int)($rootLineUids[$level] ?? 0);
    }
    foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] ?? [] as $fieldName => $rootLineLevel) {
        // A rootline may be shorter than the configured level: fall back to 0 like rl0-rl2 do
        $fieldArray[$fieldName] = (int)($rootLineUids[$rootLineLevel] ?? 0);
    }
}
1810 
1811  /********************************
1812  *
1813  * SQL; Submitting words
1814  *
1815  *******************************/
/**
 * Makes sure all words of the word list are present in index_words.
 *
 * Counts the rows whose "wid" is one of the word hashes. If that count differs
 * from the number of words, the already stored word ids are fetched and every
 * word whose hash is not among them is inserted.
 *
 * @param array $wordListArray Word list keyed by baseword; each entry provides "hash" and "metaphone"
 */
public function checkWordList($wordListArray)
{
    if (!IndexedSearchUtility::isTableUsed('index_words') || empty($wordListArray)) {
        return;
    }

    $wordListArrayCount = count($wordListArray);
    $phashArray = array_map('intval', array_column($wordListArray, 'hash'));

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('index_words');
    $count = (int)$queryBuilder->count('baseword')
        ->from('index_words')
        ->where(
            $queryBuilder->expr()->in(
                'wid',
                $queryBuilder->createNamedParameter($phashArray, Connection::PARAM_INT_ARRAY)
            )
        )
        ->executeQuery()
        ->fetchOne();

    if ($count === $wordListArrayCount) {
        // Every word is already stored
        return;
    }

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_words');
    $queryBuilder = $connection->createQueryBuilder();

    $result = $queryBuilder->select('wid')
        ->from('index_words')
        ->where(
            $queryBuilder->expr()->in(
                'wid',
                $queryBuilder->createNamedParameter($phashArray, Connection::PARAM_INT_ARRAY)
            )
        )
        ->executeQuery();

    $this->log_setTSlogMessage('Inserting words: ' . ($wordListArrayCount - $count), LogLevel::NOTICE);

    // Collect the stored word ids as integer keys. The cast matters: depending on the
    // database driver "wid" may come back as a numeric string, which a strict comparison
    // against the integer hash would never match.
    $storedWordIds = [];
    while ($row = $result->fetchAssociative()) {
        $storedWordIds[(int)$row['wid']] = true;
    }

    foreach ($wordListArray as $baseword => $wordData) {
        if (isset($storedWordIds[(int)$wordData['hash']])) {
            continue;
        }
        // A duplicate-key error will occur here if a stored word is NOT skipped above. However as
        // long as the words are NOT longer than 60 chars (the baseword varchar is 60 characters...)
        // this is not a problem.
        $connection->insert(
            'index_words',
            [
                'wid' => $wordData['hash'],
                'baseword' => $baseword,
                'metaphone' => $wordData['metaphone'],
            ]
        );
    }
}
1880 
/**
 * Replaces the word relations (index_rel) of a page hash with the given word list.
 *
 * Words flagged as stop words in index_words are left out. For every remaining
 * word the count, first position, mapped frequency and flags are stored.
 * Nothing happens when the index_rel table is not in use.
 *
 * @param array $wordList Word list; each entry provides "hash" and "count", optionally "first" and "cmp"
 * @param int $phash Page hash the relations belong to
 */
public function submitWords($wordList, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_rel')) {
        return;
    }
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Collect the word ids of all stop words
    $queryBuilder = $connectionPool->getQueryBuilderForTable('index_words');
    $stopWordResult = $queryBuilder->select('wid')
        ->from('index_words')
        ->where(
            $queryBuilder->expr()->neq('is_stopword', $queryBuilder->createNamedParameter(0, Connection::PARAM_INT))
        )
        ->groupBy('wid')
        ->executeQuery();
    $stopWordIds = [];
    while ($stopWordRow = $stopWordResult->fetchAssociative()) {
        $stopWordIds[$stopWordRow['wid']] = true;
    }

    // Drop the old relations of this page before the new ones are written
    $relationConnection = $connectionPool->getConnectionForTable('index_rel');
    $relationConnection->delete('index_rel', ['phash' => (int)$phash]);

    $relationRows = [];
    foreach ($wordList as $wordData) {
        if (!isset($stopWordIds[$wordData['hash']])) {
            $relationRows[] = [
                (int)$phash,
                (int)$wordData['hash'],
                (int)$wordData['count'],
                (int)($wordData['first'] ?? 0),
                $this->freqMap($wordData['count'] / $this->wordcount),
                ($wordData['cmp'] ?? 0) & $this->flagBitMask,
            ];
        }
    }

    if (!empty($relationRows)) {
        $columns = ['phash', 'wid', 'count', 'first', 'freq', 'flags'];
        $relationConnection->bulkInsert('index_rel', $relationRows, $columns);
    }
}
/**
 * Maps a frequency value to an integer.
 *
 * The map factor is freqMax * 100 * freqRange. Values up to 1 are multiplied by
 * that factor and capped at freqRange; larger values are divided by it.
 *
 * @param float $freq Frequency value
 * @return int Mapped frequency
 */
public function freqMap($freq)
{
    $range = $this->freqRange;
    $factor = $this->freqMax * 100 * $range;

    if ($freq <= 1) {
        $scaled = $freq * $factor;
        // Never exceed the configured range
        return (int)($scaled > $range ? $range : $scaled);
    }
    return (int)($freq / $factor);
}
1948 
1949  /********************************
1950  *
1951  * Hashing
1952  *
1953  *******************************/
/**
 * Calculates the hashes of the current TYPO3 page and stores them in $this->hash.
 *
 * "phash_grouping" identifies a page by id, type, language, mount point and static
 * page arguments; "phash" additionally takes the group list into account.
 */
public function setT3Hashes()
{
    $staticPageArguments = is_array($this->conf['staticPageArguments'])
        ? json_encode($this->conf['staticPageArguments'])
        : null;

    // Base values of both hashes; key order and types are part of the serialized value
    $groupingValues = [
        'id' => (int)$this->conf['id'],
        'type' => (int)$this->conf['type'],
        'sys_lang' => (int)$this->conf['sys_language_uid'],
        'MP' => (string)$this->conf['MP'],
        'staticPageArguments' => $staticPageArguments,
    ];
    // Grouping hash (identifies a "page" combined of id, type, language, mountpoint and cHash parameters)
    $this->hash['phash_grouping'] = IndexedSearchUtility::md5inthash(serialize($groupingValues));

    // Plain phash: subdivision where special page composition based on login is taken into account
    // as well. It is expected that such pages are normally similar regardless of the login.
    $pageValues = $groupingValues + ['gr_list' => (string)$this->conf['gr_list']];
    $this->hash['phash'] = IndexedSearchUtility::md5inthash(serialize($pageValues));
}
1973 
/**
 * Calculates the hashes of an external file.
 *
 * "phash_grouping" is derived from the file name only; "phash" additionally
 * includes the sub information.
 *
 * @param string $file File name
 * @param array $subinfo Sub information of the file
 * @return array Array with the keys "phash_grouping" and "phash"
 */
public function setExtHashes($file, $subinfo = [])
{
    $groupingValues = ['file' => $file];
    $fileValues = $groupingValues + ['subinfo' => $subinfo];

    return [
        'phash_grouping' => IndexedSearchUtility::md5inthash(serialize($groupingValues)),
        'phash' => IndexedSearchUtility::md5inthash(serialize($fileValues)),
    ];
}
1995 
1996  /*********************************
1997  *
1998  * Internal logging functions
1999  *
2000  *********************************/
/**
 * Forwards a push call to the time tracker.
 *
 * @param string $msg Passed as first argument to the time tracker's push()
 * @param string $key Passed as second argument to the time tracker's push()
 */
public function log_push($msg, $key)
{
    $tracker = $this->timeTracker;
    $tracker->push($msg, $key);
}
2011 
/**
 * Forwards a pull call to the time tracker.
 */
public function log_pull()
{
    $tracker = $this->timeTracker;
    $tracker->pull();
}
2019 
/**
 * Sends a log message to the time tracker and records it in the internal log.
 *
 * @param string $msg Message text; appended to $this->internal_log as well
 * @param string $logLevel PSR-3 log level handed to the time tracker, defaults to LogLevel::INFO
 */
public function log_setTSlogMessage($msg, $logLevel = LogLevel::INFO)
{
    $tracker = $this->timeTracker;
    $tracker->setTSlogMessage($msg, $logLevel);
    // Keep a copy of the plain message; the internal log ends up in the debug record
    $this->internal_log[] = $msg;
}
2031 
/**
 * Normalizes a comma separated keyword list.
 *
 * The keywords are trimmed, re-joined with ", " and the result is wrapped in a
 * leading and a trailing space.
 *
 * @param string $keywordList Comma separated list of keywords
 * @return string The re-joined list with surrounding spaces
 */
protected function addSpacesToKeywordList($keywordList)
{
    $trimmedKeywords = GeneralUtility::trimExplode(',', $keywordList);
    $joinedKeywords = implode(', ', $trimmedKeywords);
    return ' ' . $joinedKeywords . ' ';
}
2045 
/**
 * Removes all words from the list whose hash was already seen for an earlier word.
 *
 * The first word of each hash is kept; the keys of the kept entries are preserved.
 * Seen hashes are tracked as array keys, which keeps the lookup constant-time
 * instead of scanning a growing list for every word.
 * NOTE(review): relies on the hashes being integers, as returned by
 * IndexedSearchUtility::md5inthash() — confirm no other hash type is passed in.
 *
 * @param array $wordList Word list keyed by baseword; each entry provides "hash"
 * @return array Word list with a unique hash per entry
 */
private function removePhashCollisions(array $wordList): array
{
    $seenHashes = [];
    foreach ($wordList as $baseword => $wordData) {
        $hash = $wordData['hash'];
        if (isset($seenHashes[$hash])) {
            unset($wordList[$baseword]);
            continue;
        }
        $seenHashes[$hash] = true;
    }
    return $wordList;
}
2065 }
‪TYPO3\CMS\IndexedSearch\Indexer\splitHTMLContent
‪array splitHTMLContent($content)
Definition: Indexer.php:360
‪TYPO3\CMS\IndexedSearch\Indexer\updateParsetime
‪updateParsetime($phash, $parsetime)
Definition: Indexer.php:1725
‪TYPO3\CMS\Core\Utility\GeneralUtility\trimExplode
‪static list< string > trimExplode($delim, $string, $removeEmptyValues=false, $limit=0)
Definition: GeneralUtility.php:999
‪TYPO3\CMS\IndexedSearch\Indexer\$lexerObj
‪TYPO3 CMS IndexedSearch Lexer $lexerObj
Definition: Indexer.php:189
‪TYPO3\CMS\IndexedSearch
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility\md5inthash
‪static int md5inthash($stringToHash)
Definition: IndexedSearchUtility.php:48
‪TYPO3\CMS\Core\Utility\PathUtility
Definition: PathUtility.php:25
‪TYPO3\CMS\Core\Database\Connection\PARAM_INT
‪const PARAM_INT
Definition: Connection.php:49
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPathUsingDomainURL
‪string createLocalPathUsingDomainURL($sourcePath)
Definition: Indexer.php:695
‪TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility
Definition: DoubleMetaPhoneUtility.php:27
‪TYPO3\CMS\Core\Core\Environment\getPublicPath
‪static string getPublicPath()
Definition: Environment.php:206
‪TYPO3\CMS\IndexedSearch\Indexer\submit_grlist
‪submit_grlist($hash, $phash_x)
Definition: Indexer.php:1231
‪TYPO3\CMS\IndexedSearch\Indexer\$defaultContentArray
‪array $defaultContentArray
Definition: Indexer.php:100
‪TYPO3\CMS\Core\Html\HtmlParser
Definition: HtmlParser.php:27
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPathUsingAbsRefPrefix
‪string createLocalPathUsingAbsRefPrefix($sourcePath)
Definition: Indexer.php:717
‪TYPO3\CMS\IndexedSearch\Indexer\indexRegularDocument
‪indexRegularDocument($file, $force=false, $contentTmpFile='', $altExtension='')
Definition: Indexer.php:811
‪TYPO3\CMS\Core\Configuration\ExtensionConfiguration
Definition: ExtensionConfiguration.php:45
‪TYPO3\CMS\IndexedSearch\Indexer\$externalFileCounter
‪int $externalFileCounter
Definition: Indexer.php:113
‪TYPO3\CMS\IndexedSearch\Indexer\updateSetId
‪updateSetId($phash)
Definition: Indexer.php:1700
‪TYPO3\CMS\IndexedSearch\Indexer\isAllowedLocalFile
‪static bool isAllowedLocalFile($filePath)
Definition: Indexer.php:790
‪TYPO3\CMS\IndexedSearch\Indexer\$external_parsers
‪array $external_parsers
Definition: Indexer.php:61
‪TYPO3\CMS\IndexedSearch\Indexer\indexAnalyze
‪array indexAnalyze($content)
Definition: Indexer.php:1033
‪TYPO3\CMS\IndexedSearch\Indexer\$indexerConfig
‪array $indexerConfig
Definition: Indexer.php:123
‪TYPO3\CMS\IndexedSearch\Indexer\isRelativeURL
‪static bool isRelativeURL($url)
Definition: Indexer.php:778
‪TYPO3\CMS\IndexedSearch\Indexer\$storeMetaphoneInfoAsWords
‪bool $storeMetaphoneInfoAsWords
Definition: Indexer.php:173
‪TYPO3\CMS\Core\Utility\MathUtility\forceIntegerInRange
‪static int forceIntegerInRange($theInt, $min, $max=2000000000, $defaultValue=0)
Definition: MathUtility.php:32
‪TYPO3\CMS\IndexedSearch\Indexer\indexExternalUrl
‪indexExternalUrl($externalUrl)
Definition: Indexer.php:626
‪TYPO3\CMS\IndexedSearch\Indexer\analyzeHeaderinfo
‪analyzeHeaderinfo(&$retArr, $content, $key, $offset)
Definition: Indexer.php:1051
‪TYPO3\CMS\IndexedSearch\Indexer\removeOldIndexedPages
‪removeOldIndexedPages($phash)
Definition: Indexer.php:1274
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility\isTableUsed
‪static bool isTableUsed($tableName)
Definition: IndexedSearchUtility.php:35
‪TYPO3\CMS\IndexedSearch\Indexer\log_pull
‪log_pull()
Definition: Indexer.php:1987
‪TYPO3\CMS\IndexedSearch\Indexer\$freqMax
‪float $freqMax
Definition: Indexer.php:165
‪TYPO3\CMS\IndexedSearch\Indexer\checkContentHash
‪mixed checkContentHash()
Definition: Indexer.php:1559
‪TYPO3\CMS\IndexedSearch\Indexer\convertHTMLToUtf8
‪string convertHTMLToUtf8($content, $charset='')
Definition: Indexer.php:430
‪$fields
‪$fields
Definition: pages.php:5
‪TYPO3\CMS\IndexedSearch\Indexer\bodyDescription
‪string bodyDescription($contentArr)
Definition: Indexer.php:1011
‪TYPO3\CMS\IndexedSearch\Indexer\update_grlist
‪update_grlist($phash, $phash_x)
Definition: Indexer.php:1643
‪TYPO3\CMS\Core\Utility\PathUtility\basename
‪static string basename($path)
Definition: PathUtility.php:226
‪TYPO3\CMS\IndexedSearch\Indexer\freqMap
‪int freqMap($freq)
Definition: Indexer.php:1909
‪TYPO3\CMS\IndexedSearch\Indexer\$content_md5h
‪int $content_md5h
Definition: Indexer.php:147
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPathFromAbsoluteURL
‪string createLocalPathFromAbsoluteURL($sourcePath)
Definition: Indexer.php:741
‪TYPO3\CMS\IndexedSearch\Indexer\$forceIndexing
‪bool $forceIndexing
Definition: Indexer.php:94
‪TYPO3\CMS\IndexedSearch\Indexer\submitFile_section
‪submitFile_section($hash)
Definition: Indexer.php:1436
‪TYPO3\CMS\IndexedSearch\Indexer\$file_phash_arr
‪array $file_phash_arr
Definition: Indexer.php:135
‪TYPO3\CMS\IndexedSearch\Indexer\$wordcount
‪int $wordcount
Definition: Indexer.php:109
‪TYPO3\CMS\IndexedSearch\Indexer\removeOldIndexedFiles
‪removeOldIndexedFiles($phash)
Definition: Indexer.php:1470
‪TYPO3\CMS\IndexedSearch\Indexer\updateRootline
‪updateRootline()
Definition: Indexer.php:1747
‪TYPO3\CMS\IndexedSearch\Indexer\is_grlist_set
‪bool is_grlist_set($phash_x)
Definition: Indexer.php:1619
‪TYPO3\CMS\IndexedSearch\Indexer\log_push
‪log_push($msg, $key)
Definition: Indexer.php:1979
‪TYPO3\CMS\Core\Utility\GeneralUtility\getUrl
‪static string false getUrl($url)
Definition: GeneralUtility.php:1697
‪TYPO3\CMS\IndexedSearch\Indexer\$excludeSections
‪string $excludeSections
Definition: Indexer.php:55
‪TYPO3\CMS\IndexedSearch\Indexer\checkMtimeTstamp
‪int checkMtimeTstamp($mtime, $phash)
Definition: Indexer.php:1496
‪TYPO3\CMS\IndexedSearch\Indexer\$reasons
‪array $reasons
Definition: Indexer.php:42
‪TYPO3\CMS\IndexedSearch\Indexer\$timeTracker
‪TimeTracker $timeTracker
Definition: Indexer.php:197
‪TYPO3\CMS\IndexedSearch\Indexer\$enableMetaphoneSearch
‪bool $enableMetaphoneSearch
Definition: Indexer.php:169
‪TYPO3\CMS\IndexedSearch\Indexer\charsetEntity2utf8
‪charsetEntity2utf8(&$contentArr, $charset)
Definition: Indexer.php:971
‪TYPO3\CMS\IndexedSearch\Indexer\$indexExternalUrl_content
‪string $indexExternalUrl_content
Definition: Indexer.php:157
‪TYPO3\CMS\IndexedSearch\Indexer\$contentParts
‪array $contentParts
Definition: Indexer.php:141
‪TYPO3\CMS\IndexedSearch\Indexer\$flagBitMask
‪int $flagBitMask
Definition: Indexer.php:193
‪TYPO3\CMS\IndexedSearch\Indexer\$internal_log
‪array $internal_log
Definition: Indexer.php:151
‪TYPO3\CMS\IndexedSearch\Indexer\fileContentParts
‪array fileContentParts($ext, $absFile)
Definition: Indexer.php:936
‪TYPO3\CMS\IndexedSearch\Indexer\log_setTSlogMessage
‪log_setTSlogMessage($msg, $logLevel=LogLevel::INFO)
Definition: Indexer.php:1998
‪TYPO3\CMS\IndexedSearch\Indexer\setExtHashes
‪array setExtHashes($file, $subinfo=[])
Definition: Indexer.php:1953
‪TYPO3\CMS\IndexedSearch\Indexer\$tstamp_minAge
‪int $tstamp_minAge
Definition: Indexer.php:82
‪TYPO3\CMS\IndexedSearch\Indexer\checkExternalDocContentHash
‪bool checkExternalDocContentHash($hashGr, $content_md5h)
Definition: Indexer.php:1593
‪TYPO3\CMS\IndexedSearch\Indexer\init
‪init(array $configuration=null)
Definition: Indexer.php:226
‪TYPO3\CMS\IndexedSearch\Indexer\readFileContent
‪array readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
Definition: Indexer.php:919
‪TYPO3\CMS\IndexedSearch\Indexer\submitFilePage
‪submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
Definition: Indexer.php:1313
‪TYPO3\CMS\IndexedSearch\Indexer\$hash
‪array $hash
Definition: Indexer.php:129
‪TYPO3\CMS\IndexedSearch\Indexer\submitWords
‪submitWords($wordList, $phash)
Definition: Indexer.php:1859
‪TYPO3\CMS\IndexedSearch\Indexer\$conf
‪array $conf
Definition: Indexer.php:117
‪TYPO3\CMS\Core\Http\RequestFactory
Definition: RequestFactory.php:31
‪TYPO3\CMS\IndexedSearch\Indexer\updateTstamp
‪updateTstamp($phash, $mtime=0)
Definition: Indexer.php:1670
‪TYPO3\CMS\IndexedSearch\Indexer\processWordsInArrays
‪array processWordsInArrays($contentArr)
Definition: Indexer.php:991
‪TYPO3\CMS\IndexedSearch\Indexer\embracingTags
‪bool embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
Definition: Indexer.php:455
‪TYPO3\CMS\Core\Utility\PathUtility\isAbsolutePath
‪static bool isAbsolutePath($path)
Definition: PathUtility.php:296
‪TYPO3\CMS\IndexedSearch\Indexer\extractLinks
‪extractLinks($content)
Definition: Indexer.php:512
‪TYPO3\CMS\IndexedSearch\Indexer\indexTypo3PageContent
‪indexTypo3PageContent()
Definition: Indexer.php:280
‪TYPO3\CMS\Core\Database\Connection
Definition: Connection.php:38
‪TYPO3\CMS\IndexedSearch\Indexer\submitFile_grlist
‪submitFile_grlist($hash)
Definition: Indexer.php:1390
‪TYPO3\CMS\IndexedSearch\Indexer\getRootLineFields
‪getRootLineFields(array &$fieldArray)
Definition: Indexer.php:1773
‪TYPO3\CMS\IndexedSearch\Indexer\$tstamp_maxAge
‪int $tstamp_maxAge
Definition: Indexer.php:75
‪TYPO3\CMS\IndexedSearch\Indexer\getUrlHeaders
‪mixed getUrlHeaders($url)
Definition: Indexer.php:650
‪TYPO3\CMS\IndexedSearch\Indexer\$metaphoneObj
‪TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility $metaphoneObj
Definition: Indexer.php:183
‪TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController
Definition: TypoScriptFrontendController.php:104
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility\milliseconds
‪static int milliseconds()
Definition: IndexedSearchUtility.php:175
‪$GLOBALS
‪$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['adminpanel']['modules']
Definition: ext_localconf.php:25
‪TYPO3\CMS\Core\Core\Environment
Definition: Environment.php:43
‪TYPO3\CMS\IndexedSearch\Indexer\submitPage
‪submitPage()
Definition: Indexer.php:1147
‪TYPO3\CMS\IndexedSearch\Indexer\$metaphoneContent
‪string $metaphoneContent
Definition: Indexer.php:177
‪TYPO3\CMS\IndexedSearch\Indexer\$maxExternalFiles
‪int $maxExternalFiles
Definition: Indexer.php:88
‪TYPO3\CMS\Core\Utility\MathUtility
Definition: MathUtility.php:22
‪TYPO3\CMS\IndexedSearch\Indexer\getHTMLcharset
‪string getHTMLcharset($content)
Definition: Indexer.php:412
‪TYPO3\CMS\IndexedSearch\Indexer\extractHyperLinks
‪array extractHyperLinks($html)
Definition: Indexer.php:566
‪TYPO3\CMS\IndexedSearch\Indexer\addSpacesToKeywordList
‪string addSpacesToKeywordList($keywordList)
Definition: Indexer.php:2012
‪TYPO3\CMS\IndexedSearch\Indexer\initializeExternalParsers
‪initializeExternalParsers()
Definition: Indexer.php:260
‪TYPO3\CMS\Core\Database\ConnectionPool
Definition: ConnectionPool.php:46
‪TYPO3\CMS\IndexedSearch\Indexer\__construct
‪__construct()
Definition: Indexer.php:202
‪TYPO3\CMS\IndexedSearch\Indexer\submit_section
‪submit_section($hash, $hash_t3)
Definition: Indexer.php:1254
‪TYPO3\CMS\IndexedSearch\Indexer\splitRegularContent
‪array splitRegularContent($content)
Definition: Indexer.php:953
‪TYPO3\CMS\IndexedSearch\Indexer
Definition: Indexer.php:39
‪TYPO3\CMS\Core\Utility\GeneralUtility
Definition: GeneralUtility.php:50
‪TYPO3\CMS\IndexedSearch\Indexer\metaphone
‪mixed metaphone($word, $returnRawMetaphoneValue=false)
Definition: Indexer.php:1120
‪TYPO3\CMS\Core\Utility\GeneralUtility\writeFile
‪static bool writeFile($file, $content, $changePermissions=false)
Definition: GeneralUtility.php:1722
‪TYPO3\CMS\IndexedSearch\Indexer\typoSearchTags
‪bool typoSearchTags(&$body)
Definition: Indexer.php:484
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPathFromRelativeURL
‪string createLocalPathFromRelativeURL($sourcePath)
Definition: Indexer.php:760
‪TYPO3\CMS\IndexedSearch\Indexer\checkWordList
‪checkWordList($wordListArray)
Definition: Indexer.php:1793
‪TYPO3\CMS\Core\TimeTracker\TimeTracker
Definition: TimeTracker.php:31
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility
Definition: IndexedSearchUtility.php:26
‪TYPO3\CMS\IndexedSearch\Indexer\analyzeBody
‪analyzeBody(&$retArr, $content)
Definition: Indexer.php:1085
‪TYPO3\CMS\IndexedSearch\Indexer\$freqRange
‪int $freqRange
Definition: Indexer.php:161
‪TYPO3\CMS\IndexedSearch\Indexer\extractBaseHref
‪string extractBaseHref($html)
Definition: Indexer.php:595
‪TYPO3\CMS\IndexedSearch\Indexer\setT3Hashes
‪setT3Hashes()
Definition: Indexer.php:1929
‪TYPO3\CMS\IndexedSearch\Indexer\$defaultGrList
‪string $defaultGrList
Definition: Indexer.php:69
‪TYPO3\CMS\IndexedSearch\Indexer\removePhashCollisions
‪array removePhashCollisions(array $wordList)
Definition: Indexer.php:2025
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPath
‪string createLocalPath($sourcePath)
Definition: Indexer.php:672