‪TYPO3CMS  ‪main
Indexer.php
Go to the documentation of this file.
1 <?php
2 
3 /*
4  * This file is part of the TYPO3 CMS project.
5  *
6  * It is free software; you can redistribute it and/or modify it under
7  * the terms of the GNU General Public License, either version 2
8  * of the License, or any later version.
9  *
10  * For the full copyright and license information, please read the
11  * LICENSE.txt file that was distributed with this source code.
12  *
13  * The TYPO3 project - inspiring people to share!
14  */
15 
17 
18 use Psr\Log\LogLevel;
32 
39 {
/**
 * Explanations for the codes produced by the mtime/tstamp check; used in log messages.
 * Positive codes mean "index", negative codes mean "skip".
 *
 * @var array<int, string>
 */
public $reasons = [
    -1 => 'mtime matched the document, so no changes detected and no content updated',
    -2 => 'The minimum age was not exceeded',
    1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
    2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
    3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
    4 => 'Page has never been indexed (is not represented in the index_phash table).',
];

/**
 * Comma list of HTML tags whose complete content is stripped from the body before indexing.
 *
 * @var string
 */
public $excludeSections = 'script,style';

/**
 * External document parsers, keyed by file extension (filled by initializeExternalParsers()).
 *
 * @var array
 */
public $external_parsers = [];

/**
 * Default group list.
 *
 * @var string
 */
public $defaultGrList = '0,-1';

/**
 * Maximum age in seconds; set from the "maxAge" extension setting (hours) in the constructor.
 *
 * @var int
 */
public $tstamp_maxAge = 0;

/**
 * Minimum age in seconds; set from the "minAge" extension setting (hours) in the constructor.
 *
 * @var int
 */
public $tstamp_minAge = 0;

/**
 * Upper limit of external files indexed per run; set from "maxExternalFiles" in the constructor.
 *
 * @var int
 */
public $maxExternalFiles = 0;

/**
 * If set, indexTypo3PageContent() indexes the page regardless of the mtime check.
 *
 * @var bool
 */
public $forceIndexing = false;

/**
 * Template for the content-parts array returned by splitHTMLContent() / splitRegularContent().
 * NOTE(review): the declaration line was missing from the listing; name taken from the
 * `$this->defaultContentArray` reads, visibility assumed public - confirm.
 *
 * @var array<string, string>
 */
public $defaultContentArray = [
    'title' => '',
    'description' => '',
    'keywords' => '',
    'body' => '',
];

/**
 * Running count of analysed words (incremented by analyzeHeaderinfo() and analyzeBody()).
 *
 * @var int
 */
public $wordcount = 0;

/**
 * Number of external files indexed so far, compared against $maxExternalFiles.
 *
 * @var int
 */
public $externalFileCounter = 0;

/**
 * Indexing configuration of the current document, handed in through init().
 *
 * @var array
 */
public $conf = [];

/**
 * Extension configuration of "indexed_search".
 *
 * @var array
 */
public $indexerConfig = [];

/**
 * Hashes of the current page; read with the keys "phash" and "phash_grouping".
 *
 * @var array
 */
public $hash = [];

/**
 * Hashes of the external file currently being indexed (set by indexRegularDocument()).
 *
 * @var array
 */
public $file_phash_arr = [];

/**
 * Content parts (title, description, keywords, body) of the current page.
 *
 * @var array
 */
public $contentParts = [];

/**
 * Hash over the concatenated content parts of the current page.
 *
 * @var int|string|null
 */
public $content_md5h;

/**
 * Internal log; reset for every content part in indexRegularDocument().
 *
 * @var array
 */
public $internal_log = [];

/**
 * Raw content fetched by indexExternalUrl() for the last external URL.
 *
 * @var string
 */
public $indexExternalUrl_content = '';

/**
 * NOTE(review): not read in the visible part of the file; presumably a word-frequency scaling range - confirm.
 *
 * @var int
 */
public $freqRange = 32000;

/**
 * NOTE(review): not read in the visible part of the file; presumably a word-frequency cap - confirm.
 *
 * @var float
 */
public $freqMax = 0.1;

/**
 * Whether metaphone values are computed for indexed words.
 *
 * @var bool
 */
public $enableMetaphoneSearch = false;

/**
 * Whether raw metaphone values are collected in $metaphoneContent; computed in the constructor.
 * NOTE(review): the declaration was missing from the listing; visibility assumed public - confirm.
 *
 * @var bool
 */
public $storeMetaphoneInfoAsWords;

/**
 * Space-separated raw metaphone values collected while analysing words.
 *
 * @var string
 */
public $metaphoneContent = '';

/**
 * Optional metaphone implementation registered through EXTCONF (set in init()).
 * NOTE(review): the declaration was missing from the listing; visibility assumed public - confirm.
 *
 * @var object|null
 */
public $metaphoneObj;

/**
 * Lexer instance that splits text into words (set in init()).
 *
 * @var object
 */
public $lexerObj;

/**
 * Bit mask from the "flagBitMask" extension setting, limited to 0-255.
 *
 * @var int
 */
public $flagBitMask;

/**
 * @var TimeTracker
 */
protected $timeTracker;
224 
/**
 * Fetches the "indexed_search" extension configuration and derives the indexer limits from it.
 */
public function __construct()
{
    $this->timeTracker = GeneralUtility::makeInstance(TimeTracker::class);

    // Settings maintained in the extension configuration
    $settings = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');
    $this->indexerConfig = $settings;

    // Ages are configured in hours and stored in seconds
    $minAgeSeconds = (int)($settings['minAge'] ?? 0) * 3600;
    $maxAgeSeconds = (int)($settings['maxAge'] ?? 0) * 3600;
    $this->tstamp_minAge = MathUtility::forceIntegerInRange($minAgeSeconds, 0);
    $this->tstamp_maxAge = MathUtility::forceIntegerInRange($maxAgeSeconds, 0);

    $this->maxExternalFiles = MathUtility::forceIntegerInRange((int)($settings['maxExternalFiles'] ?? 5), 0, 1000);
    $this->flagBitMask = MathUtility::forceIntegerInRange((int)($settings['flagBitMask'] ?? 0), 0, 255);

    // Workaround: a configuration that was not updated yet lacks this key; treat that as "enabled"
    $this->enableMetaphoneSearch = (bool)($settings['enableMetaphoneSearch'] ?? true);

    // Raw metaphone values are only collected when the index_words table is not in use
    $wordTableInUse = IndexedSearchUtility::isTableUsed('index_words');
    $this->storeMetaphoneInfoAsWords = !$wordTableInUse && $this->enableMetaphoneSearch;
}
241 
242  /********************************
243  *
244  * Initialization
245  *
246  *******************************/
247 
/**
 * Initializes the indexer for one document: stores the configuration, calculates the
 * page hashes and sets up external parsers, lexer and the optional metaphone hook.
 *
 * @param array|null $configuration Indexing configuration; when null the current $this->conf is kept
 */
public function init(?array $configuration = null)
{
    if (is_array($configuration)) {
        $this->conf = $configuration;
    }
    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();
    // Initialize external document parsers (only needed when external files are to be indexed):
    if ($this->conf['index_externals'] ?? false) {
        $this->initializeExternalParsers();
    }
    // Initialize lexer (class that deconstructs the text into words); a configured class wins over the default:
    $lexerObjectClassName = ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'] ?? false) ?: Lexer::class;
    /** @var Lexer $lexer */
    $lexer = GeneralUtility::makeInstance($lexerObjectClassName);
    $this->lexerObj = $lexer;
    $this->lexerObj->debug = $this->indexerConfig['debugMode'] ?? false;
    // Initialize metaphone hook:
    // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
    $metaphoneClassName = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone'] ?? false;
    if ($this->enableMetaphoneSearch && $metaphoneClassName) {
        $metaphoneObj = GeneralUtility::makeInstance($metaphoneClassName);
        $this->metaphoneObj = $metaphoneObj;
        $this->metaphoneObj->pObj = $this;
    }
}
279 
/**
 * Instantiates every parser registered in EXTCONF/indexed_search/external_parsers and keeps
 * those whose initParser() call succeeds, keyed by file extension.
 */
public function initializeExternalParsers()
{
    $registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] ?? [];
    foreach ($registeredParsers as $extension => $className) {
        $parser = GeneralUtility::makeInstance($className);
        // Register first: the parser gets a back reference to this indexer before it is initialized
        $this->external_parsers[$extension] = $parser;
        $parser->pObj = $this;
        if ($parser->initParser($extension)) {
            continue;
        }
        // Initialization failed, so the parser is dropped again
        unset($this->external_parsers[$extension]);
    }
}
297 
298  /********************************
299  *
300  * Indexing; TYPO3 pages (HTML content)
301  *
302  *******************************/
/**
 * Indexes the TYPO3 page content held in $this->conf['content'].
 *
 * The page is (re-)indexed when the mtime/tstamp check asks for it, when no gr_list entry
 * exists for the page hash, or when $forceIndexing is set. If the content hash is already
 * known, only timestamp, set id, gr_list and rootline are updated.
 */
public function indexTypo3PageContent()
{
    $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
    $is_grlist = $this->is_grlist_set($this->hash['phash']);
    if ($check <= 0 && $is_grlist && !$this->forceIndexing) {
        $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        return;
    }

    // Setting message:
    if ($this->forceIndexing) {
        $this->log_setTSlogMessage('Indexing needed, reason: Forced', LogLevel::NOTICE);
    } elseif ($check > 0) {
        $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], LogLevel::NOTICE);
    } else {
        $this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!', LogLevel::NOTICE);
    }

    // Divide into title, keywords, description and body:
    $this->log_push('Split content', '');
    $this->contentParts = $this->splitHTMLContent($this->conf['content']);
    if (!empty($this->conf['indexedDocTitle'])) {
        $this->contentParts['title'] = $this->conf['indexedDocTitle'];
    }
    $this->log_pull();

    // The hash covers all content parts on purpose: a changed page title has to trigger re-indexing too.
    $this->content_md5h = IndexedSearchUtility::md5inthash(implode('', $this->contentParts));

    // If a page with the very same content hash is already indexed, nothing but the bookkeeping
    // needs to be updated - regardless of mtime. This also avoids re-indexing for logged-in
    // frontend users when the content does not differ from the already indexed version.
    $checkCHash = $this->checkContentHash();
    if (is_array($checkCHash) && $check !== 1) {
        // Update the timestamp
        $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
        $this->updateSetId($this->hash['phash']);
        // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
        $this->update_grlist($checkCHash['phash'], $this->hash['phash']);
        $this->updateRootline();
        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
        return;
    }

    // Start of the parse time measurement
    $Pstart = IndexedSearchUtility::milliseconds();
    $this->log_push('Converting entities of content', '');
    $this->charsetEntity2utf8($this->contentParts);
    $this->log_pull();
    // Splitting words
    $this->log_push('Extract words from content', '');
    $splitInWords = $this->processWordsInArrays($this->contentParts);
    $this->log_pull();
    // Analyze the indexed words.
    $this->log_push('Analyze the extracted words', '');
    $indexArr = $this->indexAnalyze($splitInWords);
    $this->log_pull();
    // Submitting page (phash) record
    $this->log_push('Submitting page', '');
    $this->submitPage();
    $this->log_pull();
    // Check words and submit to word list if not there
    $this->log_push('Check word list and submit words', '');
    if (IndexedSearchUtility::isTableUsed('index_words')) {
        $this->checkWordList($indexArr);
        $this->submitWords($indexArr, $this->hash['phash']);
    }
    $this->log_pull();
    // Set parsetime
    $this->updateParsetime($this->hash['phash'], IndexedSearchUtility::milliseconds() - $Pstart);
    // Checking external files if configured for.
    $this->log_push('Checking external files', '');
    if ($this->conf['index_externals'] ?? false) {
        $this->extractLinks($this->conf['content']);
    }
    $this->log_pull();
}
377 
/**
 * Splits an HTML document into the content parts title, description, keywords and body.
 *
 * @param string $content HTML document
 * @return array Copy of $defaultContentArray filled with the extracted plain-text parts
 */
public function splitHTMLContent($content)
{
    // Divide head from body
    $contentArr = $this->defaultContentArray;
    $contentArr['body'] = stristr($content, '<body') ?: '';
    // Without a <body> tag the whole document counts as head. A negative length of 0 would
    // make substr() return an empty string and lose title and meta tags.
    $headPart = $contentArr['body'] === ''
        ? (string)$content
        : substr($content, 0, -strlen($contentArr['body']));
    // Get title; if it contains a colon, only the part after the first colon is used
    $this->embracingTags($headPart, 'TITLE', $contentArr['title'], $dummy2, $dummy);
    $titleParts = explode(':', $contentArr['title'], 2);
    $contentArr['title'] = trim($titleParts[1] ?? $titleParts[0]);
    // Get keywords and description meta tags
    if ($this->conf['index_metatags'] ?? false) {
        $meta = [];
        $i = 0;
        // Each call removes one meta tag from $headPart and stores its attribute string in $meta[$i]
        while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $meta[$i])) {
            $i++;
        }
        // @todo The code below stops at first unset tag. Is that correct?
        for ($i = 0; isset($meta[$i]); $i++) {
            // decode HTML entities, meta tag content needs to be encoded later
            $meta[$i] = GeneralUtility::get_tag_attributes($meta[$i], true);
            $metaName = $meta[$i]['name'] ?? '';
            // A meta tag without "content" attribute contributes an empty value
            $metaContent = $meta[$i]['content'] ?? '';
            if (stripos($metaName, 'keywords') !== false) {
                $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaContent);
            }
            if (stripos($metaName, 'description') !== false) {
                $contentArr['description'] .= ',' . $metaContent;
            }
        }
    }
    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($contentArr['body']);
    // Get rid of unwanted sections (ie. scripting and style stuff) in body
    foreach (explode(',', $this->excludeSections) as $tag) {
        while ($this->embracingTags($contentArr['body'], $tag, $dummy, $contentArr['body'], $dummy2)) {
            // Intentionally empty: every iteration strips one occurrence of the tag
        }
    }
    // Remove tags, but first make sure we don't concatenate words by doing it
    $contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
    $contentArr['body'] = trim(strip_tags($contentArr['body']));
    $contentArr['keywords'] = trim($contentArr['keywords']);
    $contentArr['description'] = trim($contentArr['description']);

    return $contentArr;
}
430 
/**
 * Reads the charset announced by a <meta http-equiv="Content-Type"> tag.
 *
 * @param string $content HTML document
 * @return string The charset as written in the document, or an empty string if none is declared
 */
public function getHTMLcharset($content)
{
    $metaTag = [];
    if (!preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i', $content, $metaTag)) {
        return '';
    }

    $charsetMatch = [];
    if (!preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i', $metaTag[0], $charsetMatch)) {
        return '';
    }

    return $charsetMatch[1];
}
447 
/**
 * Converts an HTML document to UTF-8 and decodes its HTML entities.
 *
 * @param string $content HTML document
 * @param string $charset Source charset; when empty it is looked up in the document itself
 * @return string Converted document
 */
public function convertHTMLToUtf8($content, $charset = '')
{
    // Fall back to the charset declared in the document
    if (!$charset) {
        $charset = $this->getHTMLcharset($content);
    }
    $sourceCharset = trim(strtolower($charset));

    $needsConversion = $sourceCharset && $sourceCharset !== 'utf-8';
    if ($needsConversion) {
        $content = mb_convert_encoding($content, 'utf-8', $sourceCharset);
    }

    // Entities are decoded on the assumption that the document is UTF-8 by now
    return html_entity_decode($content);
}
467 
/**
 * Finds the first occurrence of a tag (case-insensitive) and splits the string around it.
 *
 * @param string $string Content to search
 * @param string $tagName Tag name, e.g. "TITLE" or "meta"
 * @param string $tagContent Receives the text between the opening and the closing tag ('' without closing tag)
 * @param string $stringAfter Receives $string without the tag section; without a closing tag, everything after the opening tag
 * @param string $paramList Receives the raw attribute string of the opening tag
 * @return bool TRUE if the opening tag was found, FALSE otherwise (the reference parameters are left untouched then)
 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
{
    $string = (string)$string;
    $endTag = '</' . $tagName . '>';
    $startTag = '<' . $tagName;
    // stristr used because we want a case-insensitive search for the tag.
    $fromStartTag = stristr($string, $startTag);
    if (!$fromStartTag) {
        return false;
    }
    // An opening tag that is never closed by ">" yields a single element; treat the remainder as empty
    $parts = explode('>', substr($fromStartTag, strlen($startTag)), 2);
    $paramList = $parts[0];
    $afterOpeningTag = $parts[1] ?? '';
    $fromEndTag = stristr($afterOpeningTag, $endTag);
    if ($fromEndTag) {
        $stringBefore = substr($string, 0, (int)stripos($string, $startTag));
        $tagContent = substr($afterOpeningTag, 0, strlen($afterOpeningTag) - strlen($fromEndTag));
        $stringAfter = $stringBefore . substr($fromEndTag, strlen($endTag));
    } else {
        $tagContent = '';
        $stringAfter = $afterOpeningTag;
    }
    return true;
}
502 
/**
 * Reduces the body to the sections selected by <!--TYPO3SEARCH_begin--> / <!--TYPO3SEARCH_end--> markers.
 *
 * Text following a "begin" marker is kept. An "end" marker keeps the unmarked text collected
 * right before it (this covers content that starts without a "begin" marker).
 *
 * @param string $body HTML body, modified in place when at least one marker is present
 * @return bool TRUE if markers were found and $body was rewritten, otherwise FALSE
 */
public function typoSearchTags(&$body)
{
    $segments = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body) ?: [];
    if (count($segments) <= 1) {
        return false;
    }

    $body = '';
    $pending = '';
    foreach ($segments as $segment) {
        $pieces = explode('-->', $segment, 2);
        $marker = trim($pieces[0]);
        if ($marker === 'begin') {
            $body .= $pieces[1];
            $pending = '';
            continue;
        }
        if ($marker === 'end') {
            $body .= $pending;
            continue;
        }
        // Not a marker segment: remember it in case an "end" marker follows
        $pending = $segment;
    }

    return true;
}
531 
/**
 * Walks over all hyperlinks of the given HTML and indexes the external URLs and local files they point to.
 *
 * @param string $content HTML content
 */
public function extractLinks($content)
{
    foreach ($this->extractHyperLinks($content) as $linkInfo) {
        // A local path means the file is delivered by a download script: the indexed URL stays
        // $linkInfo['href'] while the file content is read from the local path.
        $source = $linkInfo['localPath'] ? $linkInfo['localPath'] : $linkInfo['href'];
        $linkSource = htmlspecialchars_decode($source);
        $qParts = parse_url($linkSource);

        // jumpurl links carry the real target in their query string
        if (($qParts['query'] ?? false) && str_contains($qParts['query'] ?? '', 'jumpurl=')) {
            parse_str($qParts['query'], $getP);
            $linkSource = $getP['jumpurl'];
            $qParts = parse_url($linkSource);
        }

        if (!$linkInfo['localPath'] && ($qParts['scheme'] ?? false)) {
            // External URL (http or otherwise), indexed only when enabled
            if ($this->indexerConfig['indexExternalURLs']) {
                $this->indexExternalUrl($linkSource);
            }
            continue;
        }
        if ($qParts['query'] ?? false) {
            // Local links with a query string are not treated as files
            continue;
        }

        $linkSource = urldecode($linkSource);
        $localFile = GeneralUtility::isAllowedAbsPath($linkSource)
            ? $linkSource
            : GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $linkSource);
        if (!$localFile || !@is_file($localFile)) {
            continue;
        }

        if ($linkInfo['localPath']) {
            $fI = pathinfo($linkSource);
            $ext = strtolower($fI['extension']);
            $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
        } else {
            $this->indexRegularDocument($linkSource);
        }
    }
}
583 
/**
 * Collects all <a> tags that have a non-empty href which is not a pure fragment link.
 *
 * @param string $html HTML content
 * @return array List of arrays with the keys "tag", "href" and "localPath"
 */
public function extractHyperLinks($html)
{
    $htmlParser = GeneralUtility::makeInstance(HtmlParser::class);
    $hyperLinksData = [];
    foreach ($htmlParser->splitTags('a', $html) as $index => $tagData) {
        // Only the odd entries of the split result are inspected as tags
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'a') {
            continue;
        }
        if (empty($tagAttributes[0]['href']) || substr($tagAttributes[0]['href'], 0, 1) === '#') {
            continue;
        }
        $href = $tagAttributes[0]['href'];
        $hyperLinksData[] = [
            'tag' => $tagData,
            'href' => $href,
            'localPath' => $this->createLocalPath(urldecode($href)),
        ];
    }
    return $hyperLinksData;
}
613 
/**
 * Returns the href of the first <base> tag that carries a non-empty href.
 *
 * @param string $html HTML content
 * @return string The base href, or an empty string when no <base> tag provides one
 */
public function extractBaseHref($html)
{
    $href = '';
    $htmlParser = GeneralUtility::makeInstance(HtmlParser::class);
    $htmlParts = $htmlParser->splitTags('base', $html);
    foreach ($htmlParts as $index => $tagData) {
        // Only the odd entries of the split result are inspected as tags
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'base') {
            continue;
        }
        // A <base> tag without href (e.g. target only) must not turn the result into null
        $href = $tagAttributes[0]['href'] ?? '';
        if ($href) {
            break;
        }
    }
    return $href;
}
639 
640  /******************************************
641  *
642  * Indexing; external URL
643  *
644  ******************************************/
/**
 * Indexes an external URL if its HEAD response announces HTML content.
 *
 * @param string $externalUrl URL, e.g. "https://example.org/"
 */
public function indexExternalUrl($externalUrl)
{
    // Get headers:
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    if (!is_array($urlHeaders)) {
        return;
    }
    // Header names are case-insensitive and the header may be absent altogether
    $contentType = '';
    foreach ($urlHeaders as $headerName => $headerValue) {
        if (strcasecmp((string)$headerName, 'Content-Type') === 0) {
            $contentType = (string)$headerValue;
            break;
        }
    }
    if (stripos($contentType, 'text/html') === false) {
        return;
    }
    $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
    if ((string)$content === '') {
        return;
    }
    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    GeneralUtility::writeFile($tmpFile, $content);
    try {
        // Index that file. TRUE as second parameter forces indexing, as mtime is meaningless for external URLs.
        $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
    } finally {
        // Remove the temporary file even if indexing fails
        unlink($tmpFile);
    }
}
668 
/**
 * Sends a HEAD request and returns the response headers.
 *
 * @param string $url URL to request
 * @return array|false Header name => header values concatenated into one string; FALSE if the request failed
 */
public function getUrlHeaders($url)
{
    try {
        $response = GeneralUtility::makeInstance(RequestFactory::class)->request($url, 'HEAD');
        // Every header comes as a list of values; collapse each list into a single string
        return array_map(
            static fn ($values) => implode('', $values),
            $response->getHeaders()
        );
    } catch (\Exception $e) {
        // fail silently if the HTTP request failed
        return false;
    }
}
690 
/**
 * Tries the available strategies one after another to map a link target to a local file path.
 *
 * @param string $sourcePath Link target (URL or path)
 * @return string Result of the first strategy that returned a non-empty path, otherwise the result of the last one
 */
protected function createLocalPath($sourcePath)
{
    $strategies = [
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
        'createLocalPathFromRelativeURL',
    ];
    foreach ($strategies as $strategy) {
        $localPath = $this->{$strategy}($sourcePath);
        if ($localPath != '') {
            return $localPath;
        }
    }
    return $localPath;
}
713 
/**
 * Maps a URL that starts with the site URL of the current request to a file below the public path.
 *
 * @param string $sourcePath Link target
 * @return string Absolute local path, or an empty string if the URL does not match or the file is not allowed
 */
protected function createLocalPathUsingDomainURL($sourcePath)
{
    $baseURL = $GLOBALS['TYPO3_REQUEST']->getAttribute('normalizedParams')->getSiteUrl();
    $baseURLLength = strlen($baseURL);
    if (!str_starts_with($sourcePath, $baseURL)) {
        return '';
    }

    $candidate = Environment::getPublicPath() . '/' . substr($sourcePath, $baseURLLength);

    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
734 
/**
 * Maps a path that starts with the configured absRefPrefix to a file below the public path.
 *
 * @param string $sourcePath Link target
 * @return string Absolute local path, or an empty string if no prefix is configured, the path does not match or the file is not allowed
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath)
{
    if (!isset($GLOBALS['TSFE']) || !$GLOBALS['TSFE'] instanceof TypoScriptFrontendController) {
        return '';
    }
    // absRefPrefix is optional, treat a missing setting like an empty prefix
    $absRefPrefix = (string)($GLOBALS['TSFE']->config['config']['absRefPrefix'] ?? '');
    if ($absRefPrefix === '' || !str_starts_with($sourcePath, $absRefPrefix)) {
        return '';
    }

    $localPath = Environment::getPublicPath() . '/' . substr($sourcePath, strlen($absRefPrefix));

    return self::isAllowedLocalFile($localPath) ? $localPath : '';
}
758 
/**
 * Maps a path with a leading slash to a file below the public path.
 *
 * @param string $sourcePath Link target
 * @return string Absolute local path, or an empty string if the path has no leading slash or the file is not allowed
 */
protected function createLocalPathFromAbsoluteURL($sourcePath)
{
    $firstCharacter = $sourcePath[0] ?? '';
    if ($firstCharacter !== '/') {
        return '';
    }

    // Drop the leading slash, the public path is joined with its own separator
    $candidate = Environment::getPublicPath() . '/' . substr($sourcePath, 1);

    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
778 
/**
 * Maps a relative URL to a file below the public path.
 *
 * @param string $sourcePath Link target
 * @return string Absolute local path, or an empty string if the URL is not relative or the file is not allowed
 */
protected function createLocalPathFromRelativeURL($sourcePath)
{
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }

    $candidate = Environment::getPublicPath() . '/' . $sourcePath;

    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
796 
/**
 * Tells whether a URL is relative: it has no scheme and its path does not start with a slash.
 *
 * @param string $url URL to check
 * @return bool
 */
protected static function isRelativeURL($url)
{
    $urlParts = @parse_url($url);
    $hasScheme = isset($urlParts['scheme']) && $urlParts['scheme'] !== '';
    if ($hasScheme) {
        return false;
    }

    $firstPathCharacter = $urlParts['path'][0] ?? '';

    return substr($firstPathCharacter, 0, 1) !== '/';
}
808 
/**
 * Checks that a path points to an existing file inside the public web directory.
 *
 * @param string $filePath Absolute path, may contain "../" segments
 * @return bool TRUE if the resolved path is a file below the public path
 */
protected static function isAllowedLocalFile($filePath)
{
    $filePath = GeneralUtility::resolveBackPath($filePath);
    // Compare against the directory including its trailing slash. A plain prefix match would
    // also accept sibling directories such as "<public>_backup/".
    $publicPath = rtrim(Environment::getPublicPath(), '/') . '/';
    $insideWebPath = str_starts_with($filePath, $publicPath);

    return $insideWebPath && is_file($filePath);
}
822 
823  /******************************************
824  *
825  * Indexing; external files (PDF, DOC, etc)
826  *
827  ******************************************/
/**
 * Indexes a regular document (PDF, DOC, ...) with the external parser registered for its extension.
 *
 * @param string $file File reference: relative to the public path, absolute, or a URL when $contentTmpFile is given
 * @param bool $force If set, indexing is done regardless of the mtime check and the external file limit
 * @param string $contentTmpFile Absolute path of a file to read the content from instead of $file
 * @param string $altExtension File extension to use instead of the one found in $file
 */
public function indexRegularDocument($file, $force = false, $contentTmpFile = '', $altExtension = '')
{
    // Init
    $fI = pathinfo($file);
    // Files without extension end up with an empty extension and are reported as unsupported below
    $ext = $altExtension ?: strtolower($fI['extension'] ?? '');
    // Create abs-path:
    if ($contentTmpFile) {
        $absFile = $contentTmpFile;
    } else {
        if (!PathUtility::isAbsolutePath($file)) {
            // Relative, prepend public web path:
            $absFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $file);
        } else {
            // Absolute, pass-through:
            $absFile = $file;
        }
        $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
    }
    if (!$absFile || !@is_file($absFile)) {
        $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
        return;
    }
    if (!($this->external_parsers[$ext] ?? false)) {
        $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
        return;
    }

    // Indexing the document, one content part at a time:
    $fileInfo = stat($absFile);
    $cParts = $this->fileContentParts($ext, $absFile);
    foreach ($cParts as $cPKey) {
        $this->internal_log = [];
        $this->log_push('Index: ' . str_replace('.', '_', PathUtility::basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
        // Start of the parse time measurement for this content part
        $Pstart = IndexedSearchUtility::milliseconds();
        $subinfo = ['key' => $cPKey];
        // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
        $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
        $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
        if ($check > 0 || $force) {
            if ($check > 0) {
                $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], LogLevel::NOTICE);
            } else {
                $this->log_setTSlogMessage('Indexing forced by flag', LogLevel::NOTICE);
            }
            // Check external file counter:
            if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                // Divide into title,keywords,description and body:
                $this->log_push('Split content', '');
                $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                $this->log_pull();
                if (is_array($contentParts)) {
                    // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                    $content_md5h = IndexedSearchUtility::md5inthash(implode('', $contentParts));
                    if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                        // Increment counter:
                        $this->externalFileCounter++;
                        // Splitting words
                        $this->log_push('Extract words from content', '');
                        $splitInWords = $this->processWordsInArrays($contentParts);
                        $this->log_pull();
                        // Analyze the indexed words.
                        $this->log_push('Analyze the extracted words', '');
                        $indexArr = $this->indexAnalyze($splitInWords);
                        $this->log_pull();
                        // Submitting page (phash) record
                        $this->log_push('Submitting page', '');
                        // The creation time of a file cannot be determined, so mtime/ctime from stat() are passed on.
                        $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
                        $this->log_pull();
                        // Check words and submit to word list if not there
                        $this->log_push('Check word list and submit words', '');
                        if (IndexedSearchUtility::isTableUsed('index_words')) {
                            $this->checkWordList($indexArr);
                            $this->submitWords($indexArr, $phash_arr['phash']);
                        }
                        $this->log_pull();
                        // Set parsetime
                        $this->updateParsetime($phash_arr['phash'], IndexedSearchUtility::milliseconds() - $Pstart);
                    } else {
                        // Update the timestamp
                        $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
                        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                    }
                } else {
                    $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                }
            } else {
                $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
            }
        } else {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        }
        // Setting a section-record for the file. This is done also if the file is not indexed.
        // Notice that section records are deleted when the page is indexed.
        $this->submitFile_section($phash_arr['phash']);
        $this->log_pull();
    }
}
933 
/**
 * Lets the external parser registered for the file extension read the content of a file.
 *
 * @param string $fileExtension File extension, e.g. "pdf"
 * @param string $absoluteFileName Absolute path of the file
 * @param string $sectionPointer Pointer to the section (content part) to read
 * @return array|null Content parts delivered by the parser, or null when no parser object is registered
 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
{
    if (!is_object($this->external_parsers[$fileExtension])) {
        return null;
    }

    return $this->external_parsers[$fileExtension]->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
952 
/**
 * Asks the external parser registered for the extension which content parts a file is divided into.
 *
 * @param string $ext File extension
 * @param string $absFile Absolute path of the file
 * @return array Section pointers from the parser; [0] when no parser object is registered
 */
public function fileContentParts($ext, $absFile)
{
    if (!is_object($this->external_parsers[$ext])) {
        // Without a parser the file counts as one undivided part
        return [0];
    }

    return $this->external_parsers[$ext]->fileContentParts($ext, $absFile);
}
969 
/**
 * Wraps plain content into a content-parts array: everything goes into "body",
 * the remaining parts keep their defaults.
 *
 * @param string $content Plain content
 * @return array Content parts
 */
public function splitRegularContent($content)
{
    return array_replace($this->defaultContentArray, ['body' => $content]);
}
983 
984  /**********************************
985  *
986  * Analysing content, Extracting words
987  *
988  **********************************/
/**
 * Decodes the HTML entities of every non-empty content part in place.
 *
 * @param array $contentArr Content parts, modified by reference
 */
public function charsetEntity2utf8(&$contentArr)
{
    foreach ($contentArr as &$part) {
        if ((string)$part === '') {
            continue;
        }
        // decode all numeric / html-entities in the string to real characters:
        $part = html_entity_decode($part);
    }
    unset($part);
}
/**
 * Splits every content part into words using the lexer.
 *
 * @param array $contentArr Content parts with the keys title, keywords, description and body
 * @return array The same keys, each holding the lexer result for that part
 */
public function processWordsInArrays($contentArr)
{
    // Split all parts to words
    $wordLists = array_map(
        fn ($text) => $this->lexerObj->split2Words($text),
        $contentArr
    );
    // For title, keywords, and description we don't want duplicates:
    foreach (['title', 'keywords', 'description'] as $part) {
        $wordLists[$part] = array_unique($wordLists[$part]);
    }

    return $wordLists;
}
/**
 * Builds the description of a document from the beginning of its body text.
 *
 * @param array $contentArr Content parts; only "body" is read
 * @return string Body with collapsed whitespace, cut to the configured length; empty if that length is 0
 */
public function bodyDescription($contentArr)
{
    $maxLength = MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
    if (!$maxLength) {
        return '';
    }

    $collapsedBody = preg_replace('/\s+/u', ' ', $contentArr['body']);

    // Shorten the string. If the database has the wrong character set,
    // the string is probably truncated again.
    return \mb_strcut($collapsedBody, 0, $maxLength, 'utf-8');
}
1044 
/**
 * Analyses the words of all content parts and returns the collected word information.
 *
 * @param array $content Word lists keyed by title, keywords, description and body
 * @return array Word information keyed by word
 */
public function indexAnalyze($content)
{
    $indexArr = [];
    // Header parts with the bit offset passed to analyzeHeaderinfo()
    $headerOffsets = ['title' => 7, 'keywords' => 6, 'description' => 5];
    foreach ($headerOffsets as $part => $offset) {
        $this->analyzeHeaderinfo($indexArr, $content, $part, $offset);
    }
    $this->analyzeBody($indexArr, $content);

    return $indexArr;
}
1060 
/**
 * Adds the words of one header part (title, keywords or description) to the word information array.
 *
 * @param array $retArr Word information keyed by word, extended by reference
 * @param array $content Word lists keyed by content part
 * @param string $key Content part to analyse
 * @param int $offset Bit position set in the "cmp" flags for words of this part
 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset)
{
    foreach ($content[$key] as $word) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $word = mb_substr($word, 0, 60);
        if (!isset($retArr[$word])) {
            // Word ID (wid)
            $retArr[$word]['hash'] = IndexedSearchUtility::md5inthash($word);
            // Metaphone value is also only 60 chars long
            $retArr[$word]['metaphone'] = $this->enableMetaphoneSearch
                ? substr($this->metaphone($word, $this->storeMetaphoneInfoAsWords), 0, 60)
                : '';
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$word]['metaphone'];
        }
        // Priority used for flagBitMask feature (see extension configuration)
        $retArr[$word]['cmp'] = ($retArr[$word]['cmp'] ?? 0) | 2 ** $offset;
        if (empty($retArr[$word]['count'])) {
            $retArr[$word]['count'] = 0;
        }

        // Increase number of occurrences
        $retArr[$word]['count']++;
        $this->wordcount++;
    }
}
1096 
/**
 * Adds the words of the body to the word information array.
 *
 * @param array $retArr Word information keyed by word, extended by reference
 * @param array $content Word lists keyed by content part; only "body" is read
 */
public function analyzeBody(&$retArr, $content)
{
    foreach ($content['body'] as $key => $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        // mb_substr() as in analyzeHeaderinfo(): a byte-wise cut could split a multibyte character
        // and would give the same word different keys in header and body.
        $val = mb_substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // First occurrence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        if (!($retArr[$val]['count'] ?? false)) {
            $retArr[$val]['count'] = 0;
        }

        // Increase number of occurrences
        $retArr[$val]['count']++;
        $this->wordcount++;
    }
}
1130 
/**
 * Calculates the metaphone value of a word, using the registered metaphone object if
 * there is one and PHP's native metaphone() otherwise.
 *
 * @param string $word Word to convert
 * @param bool $returnRawMetaphoneValue If set, the raw metaphone value is returned instead of its hash
 * @return mixed Raw metaphone value, or its integer hash (0 if the raw value is an empty string)
 */
public function metaphone($word, $returnRawMetaphoneValue = false)
{
    $rawValue = is_object($this->metaphoneObj)
        ? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
        // Use native PHP functions instead of advanced doubleMetaphone class
        : metaphone($word);

    if ($returnRawMetaphoneValue) {
        return $rawValue;
    }

    // Create hash and return integer
    return $rawValue !== '' ? IndexedSearchUtility::md5inthash($rawValue) : 0;
}
1156 
1157  /********************************
1158  *
1159  * SQL; TYPO3 Pages
1160  *
1161  *******************************/
/**
 * Writes the index records of the current TYPO3 page.
 *
 * Existing rows for $this->hash['phash'] are removed first. Afterwards rows
 * are inserted into index_phash, index_section, index_grlist and
 * index_fulltext, plus index_debug when $this->indexerConfig['debugMode']
 * is set. Every insert is skipped when IndexedSearchUtility::isTableUsed()
 * reports the table as unused.
 */
public function submitPage()
{
    $phash = $this->hash['phash'];

    // Remove any current data for this phash:
    $this->removeOldIndexedPages($phash);

    // Setting new phash_row
    $fields = [
        'phash' => $phash,
        'phash_grouping' => $this->hash['phash_grouping'],
        'static_page_arguments' => is_array($this->conf['staticPageArguments']) ? json_encode($this->conf['staticPageArguments']) : null,
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // 0 = TYPO3 page
        'item_type' => 0,
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => (int)$this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        // Creation date of page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash')
            ->insert('index_phash', $fields);
    }

    // PROCESSING index_section
    $this->submit_section($phash, $phash);
    // PROCESSING index_grlist
    $this->submit_grlist($phash, $phash);

    // PROCESSING index_fulltext
    $fields = [
        'phash' => $phash,
        'fulltextdata' => implode(' ', $this->contentParts),
        'metaphonedata' => $this->metaphoneContent,
    ];
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        $fields['fulltextdata'] = substr($fields['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext')
            ->insert('index_fulltext', $fields);
    }

    // PROCESSING index_debug
    if ($this->indexerConfig['debugMode']) {
        $debugData = [
            'external_parsers initialized' => array_keys($this->external_parsers),
            'conf' => array_merge($this->conf, ['content' => substr($this->conf['content'], 0, 1000)]),
            'contentParts' => array_merge($this->contentParts, ['body' => substr($this->contentParts['body'], 0, 1000)]),
            'logs' => $this->internal_log,
            'lexer' => $this->lexerObj->debugString,
        ];
        $fields = [
            'phash' => $phash,
            // The byte-wise substr() calls above may split a multi-byte UTF-8 character.
            // Without this flag json_encode() returns false for such input and the
            // debug information would be lost.
            'debuginfo' => json_encode($debugData, JSON_INVALID_UTF8_SUBSTITUTE),
        ];
        if (IndexedSearchUtility::isTableUsed('index_debug')) {
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('index_debug')
                ->insert('index_debug', $fields);
        }
    }
}
1241 
/**
 * Inserts an index_grlist row for the gr_list of the current configuration.
 *
 * @param int $hash Value stored in the "phash" column
 * @param int $phash_x Value stored in the "phash_x" column
 */
public function submit_grlist($hash, $phash_x)
{
    $groupList = $this->conf['gr_list'];
    // Setting the gr_list record
    $record = [
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($groupList),
        'gr_list' => $groupList,
    ];

    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->insert('index_grlist', $record);
}
1264 
/**
 * Inserts an index_section row for the page id of the current configuration.
 *
 * @param int $hash Value stored in the "phash" column
 * @param int $hash_t3 Value stored in the "phash_t3" column
 */
public function submit_section($hash, $hash_t3)
{
    $fields = [
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id'],
    ];
    // Add the rootline columns (rl0, rl1, rl2 and configured extra fields) so a new
    // row carries the same values updateRootline() maintains on this table.
    // NOTE(review): this call was reconstructed from a gap in the listing - confirm against upstream.
    $this->getRootLineFields($fields);

    if (IndexedSearchUtility::isTableUsed('index_section')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_section')
            ->insert('index_section', $fields);
    }
}
1286 
/**
 * Removes the index records of a TYPO3 page.
 *
 * @param int $phash The phash whose rows are deleted
 */
public function removeOldIndexedPages($phash)
{
    // Removing old registrations for all tables. Because the pages are TYPO3 pages
    // there can be nothing else than 1-1 relations here.
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    $tableArray = ['index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug'];
    foreach ($tableArray as $table) {
        // Only touch tables that are in use, like removeOldIndexedFiles() does
        if (IndexedSearchUtility::isTableUsed($table)) {
            $connectionPool->getConnectionForTable($table)->delete($table, ['phash' => (int)$phash]);
        }
    }

    // Removing all index_section records with hash_t3 set to this hash (this includes such
    // records set for external media on the page as well!). The re-insert of these records
    // are done in indexRegularDocument($file).
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        $connectionPool->getConnectionForTable('index_section')
            ->delete('index_section', ['phash_t3' => (int)$phash]);
    }
}
1312 
1313  /********************************
1314  *
1315  * SQL; External media
1316  *
1317  *******************************/
/**
 * Writes the index records of an external file.
 *
 * Existing rows for $hash['phash'] are removed via removeOldIndexedFiles()
 * first. Afterwards rows are inserted into index_phash and index_fulltext,
 * plus index_debug when $this->indexerConfig['debugMode'] is set. Every
 * insert is skipped when the table is reported as unused.
 *
 * @param array $hash Array with the keys "phash" and "phash_grouping"
 * @param string $file File reference; a URL scheme in it marks the row as external URL
 * @param array $subinfo Stored JSON encoded in "static_page_arguments"
 * @param string $ext Key into $this->external_parsers, also the fallback item type
 * @param int $mtime Stored in "item_mtime"
 * @param int $ctime Stored in "item_crdate"
 * @param int $size Stored in "item_size"
 * @param int $content_md5h Stored in "contentHash"
 * @param array $contentParts Content parts; "title" and "body" are read explicitly
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
{
    // Find item type; fall back to the extension if the parser maps nothing (or is not registered)
    $storeItemType = ($this->external_parsers[$ext]->ext2itemtype_map[$ext] ?? '') ?: $ext;
    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($hash['phash']);
    // Split filename:
    $fileParts = parse_url($file);
    // Setting new
    $fields = [
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'static_page_arguments' => json_encode($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $storeItemType,
        // A missing title must not reach trim() as null
        'item_title' => trim((string)($contentParts['title'] ?? '')) ?: PathUtility::basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        'externalUrl' => ($fileParts['scheme'] ?? false) ? 1 : 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid'],
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash')
            ->insert('index_phash', $fields);
    }

    // PROCESSING index_fulltext
    $fields = [
        'phash' => $hash['phash'],
        'fulltextdata' => implode(' ', $contentParts),
        'metaphonedata' => $this->metaphoneContent,
    ];
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        $fields['fulltextdata'] = substr($fields['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext')
            ->insert('index_fulltext', $fields);
    }

    // PROCESSING index_debug
    if ($this->indexerConfig['debugMode']) {
        $debugData = [
            'static_page_arguments' => $subinfo,
            'contentParts' => array_merge($contentParts, ['body' => substr($contentParts['body'], 0, 1000)]),
            'logs' => $this->internal_log,
            'lexer' => $this->lexerObj->debugString,
        ];
        $fields = [
            'phash' => $hash['phash'],
            // The byte-wise substr() above may split a multi-byte UTF-8 character.
            // Without this flag json_encode() returns false for such input and the
            // debug information would be lost.
            'debuginfo' => json_encode($debugData, JSON_INVALID_UTF8_SUBSTITUTE),
        ];
        if (IndexedSearchUtility::isTableUsed('index_debug')) {
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('index_debug')
                ->insert('index_debug', $fields);
        }
    }
}
1402 
/**
 * Stores a gr_list record for an indexed file unless a matching one exists.
 *
 * A record counts as matching if it belongs to the given phash and its
 * hash_gr_list equals the hash of either $this->defaultGrList or the
 * gr_list of the current configuration.
 *
 * @param int $hash The phash of the file
 */
public function submitFile_grlist($hash)
{
    // Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_grlist');
    $count = (int)$queryBuilder->count('*')
        ->from('index_grlist')
        ->where(
            $queryBuilder->expr()->eq(
                'phash',
                $queryBuilder->createNamedParameter($hash, Connection::PARAM_INT)
            ),
            $queryBuilder->expr()->or(
                $queryBuilder->expr()->eq(
                    'hash_gr_list',
                    $queryBuilder->createNamedParameter(
                        IndexedSearchUtility::md5inthash($this->defaultGrList),
                        Connection::PARAM_INT
                    )
                ),
                $queryBuilder->expr()->eq(
                    'hash_gr_list',
                    $queryBuilder->createNamedParameter(
                        IndexedSearchUtility::md5inthash($this->conf['gr_list']),
                        Connection::PARAM_INT
                    )
                )
            )
        )
        ->executeQuery()
        ->fetchOne();

    if ($count === 0) {
        // No suitable record yet: add one, linked to the phash of the current page
        $this->submit_grlist($hash, $this->hash['phash']);
    }
}
1448 
/**
 * Stores a section record for an indexed file unless one already exists
 * for the given phash and the page id of the current configuration.
 *
 * @param int $hash The phash of the file
 */
public function submitFile_section($hash)
{
    // Testing if there is already a section
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }

    $query = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_section');
    $matchesPhash = $query->expr()->eq(
        'phash',
        $query->createNamedParameter($hash, Connection::PARAM_INT)
    );
    $matchesPage = $query->expr()->eq(
        'page_id',
        $query->createNamedParameter($this->conf['id'], Connection::PARAM_INT)
    );
    $existingSections = (int)$query->count('phash')
        ->from('index_section')
        ->where($matchesPhash, $matchesPage)
        ->executeQuery()
        ->fetchOne();

    if ($existingSections !== 0) {
        return;
    }
    $this->submit_section($hash, $this->hash['phash']);
}
1482 
/**
 * Removes the index records of an external file.
 *
 * Rows are deleted from index_phash, index_grlist, index_fulltext and
 * index_debug, skipping tables that are reported as unused.
 *
 * @param int $phash The phash whose rows are deleted
 */
public function removeOldIndexedFiles($phash)
{
    $pool = GeneralUtility::makeInstance(ConnectionPool::class);
    // Removing old registrations for tables.
    foreach (['index_phash', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        if (IndexedSearchUtility::isTableUsed($tableName)) {
            $pool->getConnectionForTable($tableName)->delete($tableName, ['phash' => (int)$phash]);
        }
    }
}
1500 
1501  /********************************
1502  *
1503  * SQL Helper functions
1504  *
1505  *******************************/
/**
 * Decides, based on the stored index_phash row, whether a document must be indexed.
 *
 * Return values (see also $this->reasons):
 *   4: no index_phash row exists for the phash (or the table is unused)
 *   1: $this->tstamp_maxAge is set and exceeded
 *  -2: $this->tstamp_minAge is set and not exceeded
 *   3: $mtime is not set
 *   2: $mtime differs from the stored item_mtime
 *  -1: $mtime matches the stored item_mtime; the stored tstamp is refreshed
 *      unless a max age is configured
 *
 * @param int $mtime Modification time of the document
 * @param int $phash The phash of the document
 * @return int One of the codes listed above; positive values are the "index it" cases
 */
public function checkMtimeTstamp($mtime, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        // Not indexed (not in index_phash)
        return 4;
    }

    $indexedRow = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(
            ['item_mtime', 'tstamp'],
            'index_phash',
            ['phash' => (int)$phash],
            [],
            [],
            1
        )
        ->fetchAssociative();

    if (empty($indexedRow)) {
        // Page has never been indexed (is not represented in the index_phash table).
        return 4;
    }

    if ($this->tstamp_maxAge && $GLOBALS['EXEC_TIME'] > $indexedRow['tstamp'] + $this->tstamp_maxAge) {
        // The configured max-age was exceeded for the document and thus it's indexed.
        return 1;
    }

    if ($this->tstamp_minAge && !($GLOBALS['EXEC_TIME'] > $indexedRow['tstamp'] + $this->tstamp_minAge)) {
        // The minimum age was not exceeded
        return -2;
    }

    // From here on minAge is either not set or exceeded, so mtime decides.
    if (!$mtime) {
        // The minimum age was exceed, but mtime was not set, so the page was indexed.
        return 3;
    }

    if ($indexedRow['item_mtime'] != $mtime) {
        // The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.
        return 2;
    }

    // mtime matched the document, so no changes detected and no content updated
    if ($this->tstamp_maxAge) {
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($indexedRow['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . ' seconds to expire time).', LogLevel::WARNING);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.', LogLevel::NOTICE);
    }
    return -1;
}
1571 
/**
 * Looks for an index_phash row with the same phash_grouping and content hash
 * as the current page.
 *
 * @return mixed The matching row (array with the key "phash") if one exists, otherwise TRUE
 */
public function checkContentHash()
{
    // With this query the page will only be indexed if it's content is different from the same "phash_grouping" -page.
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $criteria = [
        'phash_grouping' => (int)$this->hash['phash_grouping'],
        'contentHash' => (int)$this->content_md5h,
    ];
    $matchingRow = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(['phash'], 'index_phash', $criteria, [], [], 1)
        ->fetchAssociative();

    return empty($matchingRow) ? true : $matchingRow;
}
1602 
/**
 * Checks that no index_phash row exists with the given grouping hash and content hash.
 *
 * @param int $hashGr Value compared against "phash_grouping"
 * @param int $content_md5h Value compared against "contentHash"
 * @return bool TRUE if no such row exists (or the table is unused), otherwise FALSE
 */
public function checkExternalDocContentHash($hashGr, $content_md5h)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $criteria = [
        'phash_grouping' => (int)$hashGr,
        'contentHash' => (int)$content_md5h,
    ];
    $matches = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->count('*', 'index_phash', $criteria);

    return $matches === 0;
}
1630 
/**
 * Tells whether index_grlist contains at least one row for the given phash_x.
 *
 * @param int $phash_x Value compared against the "phash_x" column
 * @return bool TRUE if a row exists; FALSE if not or if the table is unused
 */
public function is_grlist_set($phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return false;
    }

    $matches = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count('phash_x', 'index_grlist', ['phash_x' => (int)$phash_x]);

    return $matches > 0;
}
1653 
/**
 * Inserts a gr_list record for the phash if none exists yet for the
 * gr_list of the current configuration.
 *
 * @param int $phash Value compared against and stored in the "phash" column
 * @param int $phash_x Value stored in the "phash_x" column of a new record
 */
public function update_grlist($phash, $phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $criteria = [
        'phash' => (int)$phash,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($this->conf['gr_list']),
    ];
    $existing = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count('phash', 'index_grlist', $criteria);

    if ($existing !== 0) {
        return;
    }
    $this->submit_grlist($phash, $phash_x);
    $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', LogLevel::NOTICE);
}
1681 
/**
 * Sets the tstamp of an index_phash row to the current execution time.
 *
 * @param int $phash The phash of the row to update
 * @param int $mtime If set, "item_mtime" is updated to this value as well
 */
public function updateTstamp($phash, $mtime = 0)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }

    $values = ['tstamp' => $GLOBALS['EXEC_TIME']];
    if ($mtime) {
        $values['item_mtime'] = (int)$mtime;
    }

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash');
    $connection->update('index_phash', $values, ['phash' => (int)$phash]);
}
1712 
/**
 * Writes the freeIndexSetId of the current configuration to an index_phash row.
 *
 * @param int $phash The phash of the row to update
 */
public function updateSetId($phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }

    $values = ['freeIndexSetId' => (int)$this->conf['freeIndexSetId']];
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash');
    $connection->update('index_phash', $values, ['phash' => (int)$phash]);
}
1736 
/**
 * Stores the parse time in an index_phash row.
 *
 * @param int $phash The phash of the row to update
 * @param int $parsetime Value written to the "parsetime" column (cast to int)
 */
public function updateParsetime($phash, $parsetime)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }

    $values = ['parsetime' => (int)$parsetime];
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash');
    $connection->update('index_phash', $values, ['phash' => (int)$phash]);
}
1761 
/**
 * Updates the rootline columns of all index_section rows that belong to
 * the page id of the current configuration.
 */
public function updateRootline()
{
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }

    // Collect the rootline columns (filled by reference)
    $rootLineValues = [];
    $this->getRootLineFields($rootLineValues);

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_section');
    $connection->update('index_section', $rootLineValues, ['page_id' => (int)$this->conf['id']]);
}
/**
 * Adds the rootline columns to a field array.
 *
 * Sets "rl0", "rl1" and "rl2" from the first three entries of
 * $this->conf['rootline_uids'], plus one field per entry of
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields']
 * (field name => rootline level). Levels that are not present in the
 * rootline result in 0.
 *
 * @param array $fieldArray Field array, passed by reference
 */
public function getRootLineFields(array &$fieldArray)
{
    $rootLineUids = $this->conf['rootline_uids'] ?? [];

    $fieldArray['rl0'] = (int)($rootLineUids[0] ?? 0);
    $fieldArray['rl1'] = (int)($rootLineUids[1] ?? 0);
    $fieldArray['rl2'] = (int)($rootLineUids[2] ?? 0);

    $additionalFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] ?? [];
    foreach ($additionalFields as $fieldName => $rootLineLevel) {
        // Same fallback as for rl0-rl2: a rootline shorter than the configured level yields 0
        $fieldArray[$fieldName] = (int)($rootLineUids[$rootLineLevel] ?? 0);
    }
}
1800 
1801  /********************************
1802  *
1803  * SQL; Submitting words
1804  *
1805  *******************************/
/**
 * Makes sure all words of the word list exist in index_words.
 *
 * Counts the rows whose wid matches one of the word hashes; if that count
 * differs from the size of the list, the words already stored are removed
 * from the list and the remaining ones are inserted.
 *
 * @param array $wordListArray Word list keyed by word; each entry provides "hash" and "metaphone"
 */
public function checkWordList($wordListArray)
{
    if (!IndexedSearchUtility::isTableUsed('index_words') || empty($wordListArray)) {
        return;
    }

    $expectedWordCount = count($wordListArray);
    $wordIds = array_map('intval', array_column($wordListArray, 'hash'));

    $countQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('index_words');
    $storedWordCount = (int)$countQuery->count('baseword')
        ->from('index_words')
        ->where(
            $countQuery->expr()->in(
                'wid',
                $countQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        )
        ->executeQuery()
        ->fetchOne();

    if ($storedWordCount === $expectedWordCount) {
        // Every word is stored already
        return;
    }

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_words');
    $selectQuery = $connection->createQueryBuilder();
    $storedWords = $selectQuery->select('baseword')
        ->from('index_words')
        ->where(
            $selectQuery->expr()->in(
                'wid',
                $selectQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        )
        ->executeQuery();

    $this->log_setTSlogMessage('Inserting words: ' . ($expectedWordCount - $storedWordCount), LogLevel::NOTICE);

    // Drop the words that are stored already, so only new ones remain
    while ($storedWord = $storedWords->fetchAssociative()) {
        unset($wordListArray[$storedWord['baseword']]);
    }

    foreach ($wordListArray as $baseWord => $wordData) {
        // A duplicate-key error will occur here if a word is NOT unset in the unset() line. However as
        // long as the words in $wl are NOT longer as 60 chars (the baseword varchar is 60 characters...)
        // this is not a problem.
        $connection->insert(
            'index_words',
            [
                'wid' => $wordData['hash'],
                'baseword' => $baseWord,
                'metaphone' => $wordData['metaphone'],
            ]
        );
    }
}
1866 
/**
 * Replaces the index_rel rows (word relations) of a phash.
 *
 * Words whose wid is flagged as stop word in index_words are left out.
 *
 * @param array $wordList Word list; each entry provides "hash" and "count", optionally "first" and "cmp"
 * @param int $phash The phash the relations belong to
 */
public function submitWords($wordList, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_rel')) {
        return;
    }

    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Collect the word ids that are marked as stop words
    $stopWordQuery = $connectionPool->getQueryBuilderForTable('index_words');
    $stopWordResult = $stopWordQuery->select('wid')
        ->from('index_words')
        ->where(
            $stopWordQuery->expr()->neq('is_stopword', $stopWordQuery->createNamedParameter(0, Connection::PARAM_INT))
        )
        ->groupBy('wid')
        ->executeQuery();
    $stopWordIds = [];
    while ($stopWordRow = $stopWordResult->fetchAssociative()) {
        $stopWordIds[$stopWordRow['wid']] = true;
    }

    // Drop the relations stored so far
    $connectionPool->getConnectionForTable('index_rel')->delete('index_rel', ['phash' => (int)$phash]);

    $columns = ['phash', 'wid', 'count', 'first', 'freq', 'flags'];
    $relations = [];
    foreach ($wordList as $wordData) {
        if (!isset($stopWordIds[$wordData['hash']])) {
            $relations[] = [
                (int)$phash,
                (int)$wordData['hash'],
                (int)$wordData['count'],
                (int)($wordData['first'] ?? 0),
                $this->freqMap($wordData['count'] / $this->wordcount),
                ($wordData['cmp'] ?? 0) & $this->flagBitMask,
            ];
        }
    }

    if ($relations === []) {
        return;
    }
    $connectionPool->getConnectionForTable('index_rel')->bulkInsert('index_rel', $relations, $columns);
}
1915 
/**
 * Maps a frequency value to an integer.
 *
 * Values up to 1 are multiplied by the map factor
 * ($this->freqMax * 100 * $this->freqRange) and capped at $this->freqRange;
 * values above 1 are divided by the map factor.
 *
 * @param float $freq Frequency
 * @return int Mapped frequency
 */
public function freqMap($freq)
{
    $mapFactor = $this->freqMax * 100 * $this->freqRange;

    if ($freq > 1) {
        return (int)($freq / $mapFactor);
    }

    $mapped = $freq * $mapFactor;
    if ($mapped > $this->freqRange) {
        $mapped = $this->freqRange;
    }
    return (int)$mapped;
}
1934 
1935  /********************************
1936  *
1937  * Hashing
1938  *
1939  *******************************/
/**
 * Calculates $this->hash['phash_grouping'] and $this->hash['phash'] for the current page.
 *
 * Both are integer hashes of a serialized array; the key order of that
 * array is therefore part of the hash and must not change.
 */
public function setT3Hashes()
{
    $staticPageArguments = is_array($this->conf['staticPageArguments'])
        ? json_encode($this->conf['staticPageArguments'])
        : null;

    // Identifies a "page" combined of id, type, language, mountpoint and cHash parameters
    $groupingParts = [
        'id' => (int)$this->conf['id'],
        'type' => (int)$this->conf['type'],
        'sys_lang' => (int)$this->conf['sys_language_uid'],
        'MP' => (string)$this->conf['MP'],
        'staticPageArguments' => $staticPageArguments,
    ];
    $this->hash['phash_grouping'] = IndexedSearchUtility::md5inthash(serialize($groupingParts));

    // Add gr_list and set plain phash (Subdivision where special page composition based on login is taken
    // into account as well. It is expected that such pages are normally similar regardless of the login.)
    $pageParts = $groupingParts + ['gr_list' => (string)$this->conf['gr_list']];
    $this->hash['phash'] = IndexedSearchUtility::md5inthash(serialize($pageParts));
}
1959 
/**
 * Calculates the hashes of an external file.
 *
 * @param string $file File reference; the only input of the grouping hash
 * @param array $subinfo Additional information that is included in the phash
 * @return array Array with the keys "phash_grouping" and "phash"
 */
public function setExtHashes($file, $subinfo = [])
{
    // The grouping hash covers the file only ...
    $groupingSource = ['file' => $file];
    // ... the plain phash additionally covers the sub information
    $phashSource = ['file' => $file, 'subinfo' => $subinfo];

    return [
        'phash_grouping' => IndexedSearchUtility::md5inthash(serialize($groupingSource)),
        'phash' => IndexedSearchUtility::md5inthash(serialize($phashSource)),
    ];
}
1981 
1982  /*********************************
1983  *
1984  * Internal logging functions
1985  *
1986  *********************************/
/**
 * Forwards a push call to the time tracker.
 *
 * @param string $msg Passed as first argument to $this->timeTracker->push()
 * @param string $key Passed as second argument to $this->timeTracker->push()
 */
public function log_push($msg, $key)
{
    $tracker = $this->timeTracker;
    $tracker->push($msg, $key);
}
1997 
/**
 * Forwards a pull call to the time tracker.
 */
public function log_pull()
{
    $tracker = $this->timeTracker;
    $tracker->pull();
}
2005 
/**
 * Sends a message to the time tracker and records it in $this->internal_log.
 *
 * @param string $msg Message to log
 * @param string $logLevel Log level passed on to the time tracker, LogLevel::INFO by default
 */
public function log_setTSlogMessage($msg, $logLevel = LogLevel::INFO)
{
    $tracker = $this->timeTracker;
    $tracker->setTSlogMessage($msg, $logLevel);
    // Keep a copy of the message, independent of the time tracker
    $this->internal_log[] = $msg;
}
2017 
/**
 * Rebuilds a comma separated keyword list so that the keywords are
 * separated by ", " and the whole list is wrapped in single spaces.
 *
 * @param string $keywordList Comma separated list of keywords
 * @return string The list, split with GeneralUtility::trimExplode() and joined again
 */
protected function addSpacesToKeywordList($keywordList)
{
    return ' ' . implode(', ', GeneralUtility::trimExplode(',', $keywordList)) . ' ';
}
2031 }
‪TYPO3\CMS\IndexedSearch\Indexer\splitHTMLContent
‪array splitHTMLContent($content)
Definition: Indexer.php:358
‪TYPO3\CMS\IndexedSearch\Indexer\updateParsetime
‪updateParsetime($phash, $parsetime)
Definition: Indexer.php:1716
‪TYPO3\CMS\Core\Utility\GeneralUtility\trimExplode
‪static list< string > trimExplode($delim, $string, $removeEmptyValues=false, $limit=0)
Definition: GeneralUtility.php:916
‪TYPO3\CMS\IndexedSearch\Indexer\$lexerObj
‪TYPO3 CMS IndexedSearch Lexer $lexerObj
Definition: Indexer.php:188
‪TYPO3\CMS\IndexedSearch
‪TYPO3\CMS\Core\Utility\PathUtility
Definition: PathUtility.php:27
‪TYPO3\CMS\Core\Database\Connection\PARAM_INT
‪const PARAM_INT
Definition: Connection.php:47
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPathUsingDomainURL
‪string createLocalPathUsingDomainURL($sourcePath)
Definition: Indexer.php:693
‪TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility
Definition: DoubleMetaPhoneUtility.php:27
‪TYPO3\CMS\Core\Utility\PathUtility\isAbsolutePath
‪static isAbsolutePath(string $path)
Definition: PathUtility.php:286
‪TYPO3\CMS\IndexedSearch\Indexer\submit_grlist
‪submit_grlist($hash, $phash_x)
Definition: Indexer.php:1222
‪TYPO3\CMS\IndexedSearch\Indexer\$defaultContentArray
‪array $defaultContentArray
Definition: Indexer.php:100
‪TYPO3\CMS\Core\Html\HtmlParser
Definition: HtmlParser.php:26
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPathUsingAbsRefPrefix
‪string createLocalPathUsingAbsRefPrefix($sourcePath)
Definition: Indexer.php:715
‪TYPO3\CMS\IndexedSearch\Indexer\indexRegularDocument
‪indexRegularDocument($file, $force=false, $contentTmpFile='', $altExtension='')
Definition: Indexer.php:809
‪TYPO3\CMS\IndexedSearch\Indexer\charsetEntity2utf8
‪charsetEntity2utf8(&$contentArr)
Definition: Indexer.php:967
‪TYPO3\CMS\Core\Configuration\ExtensionConfiguration
Definition: ExtensionConfiguration.php:47
‪TYPO3\CMS\IndexedSearch\Indexer\$externalFileCounter
‪int $externalFileCounter
Definition: Indexer.php:113
‪TYPO3\CMS\IndexedSearch\Indexer\updateSetId
‪updateSetId($phash)
Definition: Indexer.php:1691
‪TYPO3\CMS\IndexedSearch\Indexer\isAllowedLocalFile
‪static bool isAllowedLocalFile($filePath)
Definition: Indexer.php:788
‪TYPO3\CMS\IndexedSearch\Indexer\$external_parsers
‪array $external_parsers
Definition: Indexer.php:61
‪TYPO3\CMS\IndexedSearch\Indexer\indexAnalyze
‪array indexAnalyze($content)
Definition: Indexer.php:1024
‪TYPO3\CMS\IndexedSearch\Indexer\$indexerConfig
‪array $indexerConfig
Definition: Indexer.php:123
‪TYPO3\CMS\Core\Core\Environment\getPublicPath
‪static getPublicPath()
Definition: Environment.php:187
‪TYPO3\CMS\IndexedSearch\Indexer\isRelativeURL
‪static bool isRelativeURL($url)
Definition: Indexer.php:776
‪TYPO3\CMS\IndexedSearch\Indexer\$storeMetaphoneInfoAsWords
‪bool $storeMetaphoneInfoAsWords
Definition: Indexer.php:173
‪TYPO3\CMS\IndexedSearch\Indexer\indexExternalUrl
‪indexExternalUrl($externalUrl)
Definition: Indexer.php:624
‪TYPO3\CMS\IndexedSearch\Indexer\analyzeHeaderinfo
‪analyzeHeaderinfo(&$retArr, $content, $key, $offset)
Definition: Indexer.php:1042
‪TYPO3\CMS\IndexedSearch\Indexer\$metaphoneObj
‪DoubleMetaPhoneUtility $metaphoneObj
Definition: Indexer.php:182
‪TYPO3\CMS\IndexedSearch\Indexer\removeOldIndexedPages
‪removeOldIndexedPages($phash)
Definition: Indexer.php:1265
‪TYPO3\CMS\IndexedSearch\Indexer\log_pull
‪log_pull()
Definition: Indexer.php:1974
‪TYPO3\CMS\IndexedSearch\Indexer\$freqMax
‪float $freqMax
Definition: Indexer.php:165
‪TYPO3\CMS\IndexedSearch\Indexer\checkContentHash
‪mixed checkContentHash()
Definition: Indexer.php:1550
‪TYPO3\CMS\IndexedSearch\Indexer\convertHTMLToUtf8
‪string convertHTMLToUtf8($content, $charset='')
Definition: Indexer.php:428
‪$fields
‪$fields
Definition: pages.php:5
‪TYPO3\CMS\IndexedSearch\Indexer\bodyDescription
‪string bodyDescription($contentArr)
Definition: Indexer.php:1004
‪TYPO3\CMS\IndexedSearch\Indexer\update_grlist
‪update_grlist($phash, $phash_x)
Definition: Indexer.php:1634
‪TYPO3\CMS\Core\Utility\PathUtility\basename
‪static basename(string $path)
Definition: PathUtility.php:219
‪TYPO3\CMS\IndexedSearch\Indexer\freqMap
‪int freqMap($freq)
Definition: Indexer.php:1896
‪TYPO3\CMS\IndexedSearch\Indexer\$content_md5h
‪int $content_md5h
Definition: Indexer.php:147
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPathFromAbsoluteURL
‪string createLocalPathFromAbsoluteURL($sourcePath)
Definition: Indexer.php:739
‪TYPO3\CMS\IndexedSearch\Indexer\$forceIndexing
‪bool $forceIndexing
Definition: Indexer.php:94
‪TYPO3\CMS\IndexedSearch\Indexer\submitFile_section
‪submitFile_section($hash)
Definition: Indexer.php:1427
‪TYPO3\CMS\IndexedSearch\Indexer\$file_phash_arr
‪array $file_phash_arr
Definition: Indexer.php:135
‪TYPO3\CMS\IndexedSearch\Indexer\$wordcount
‪int $wordcount
Definition: Indexer.php:109
‪TYPO3\CMS\IndexedSearch\Indexer\removeOldIndexedFiles
‪removeOldIndexedFiles($phash)
Definition: Indexer.php:1461
‪TYPO3\CMS\IndexedSearch\Indexer\updateRootline
‪updateRootline()
Definition: Indexer.php:1738
‪TYPO3\CMS\IndexedSearch\Indexer\is_grlist_set
‪bool is_grlist_set($phash_x)
Definition: Indexer.php:1610
‪TYPO3\CMS\IndexedSearch\Indexer\log_push
‪log_push($msg, $key)
Definition: Indexer.php:1966
‪TYPO3\CMS\Core\Utility\GeneralUtility\getUrl
‪static string false getUrl($url)
Definition: GeneralUtility.php:1542
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility\isTableUsed
‪static bool isTableUsed(string $tableName)
Definition: IndexedSearchUtility.php:37
‪TYPO3\CMS\IndexedSearch\Indexer\$excludeSections
‪string $excludeSections
Definition: Indexer.php:55
‪TYPO3\CMS\IndexedSearch\Indexer\checkMtimeTstamp
‪int checkMtimeTstamp($mtime, $phash)
Definition: Indexer.php:1487
‪TYPO3\CMS\IndexedSearch\Indexer\$reasons
‪array $reasons
Definition: Indexer.php:42
‪TYPO3\CMS\IndexedSearch\Indexer\$timeTracker
‪TimeTracker $timeTracker
Definition: Indexer.php:196
‪TYPO3\CMS\IndexedSearch\Indexer\$enableMetaphoneSearch
‪bool $enableMetaphoneSearch
Definition: Indexer.php:169
‪TYPO3\CMS\IndexedSearch\Indexer\$indexExternalUrl_content
‪string $indexExternalUrl_content
Definition: Indexer.php:157
‪TYPO3\CMS\IndexedSearch\Indexer\$contentParts
‪array $contentParts
Definition: Indexer.php:141
‪TYPO3\CMS\IndexedSearch\Indexer\$flagBitMask
‪int $flagBitMask
Definition: Indexer.php:192
‪TYPO3\CMS\IndexedSearch\Indexer\$internal_log
‪array $internal_log
Definition: Indexer.php:151
‪TYPO3\CMS\IndexedSearch\Indexer\fileContentParts
‪array fileContentParts($ext, $absFile)
Definition: Indexer.php:933
‪TYPO3\CMS\IndexedSearch\Indexer\log_setTSlogMessage
‪log_setTSlogMessage($msg, $logLevel=LogLevel::INFO)
Definition: Indexer.php:1985
‪TYPO3\CMS\IndexedSearch\Indexer\setExtHashes
‪array setExtHashes($file, $subinfo=[])
Definition: Indexer.php:1940
‪TYPO3\CMS\IndexedSearch\Indexer\$tstamp_minAge
‪int $tstamp_minAge
Definition: Indexer.php:82
‪TYPO3\CMS\IndexedSearch\Indexer\checkExternalDocContentHash
‪bool checkExternalDocContentHash($hashGr, $content_md5h)
Definition: Indexer.php:1584
‪TYPO3\CMS\IndexedSearch\Indexer\init
‪init(array $configuration=null)
Definition: Indexer.php:225
‪TYPO3\CMS\IndexedSearch\Indexer\submitFilePage
‪submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
Definition: Indexer.php:1304
‪TYPO3\CMS\IndexedSearch\Indexer\$hash
‪array $hash
Definition: Indexer.php:129
‪TYPO3\CMS\IndexedSearch\Indexer\submitWords
‪submitWords($wordList, $phash)
Definition: Indexer.php:1846
‪TYPO3\CMS\IndexedSearch\Indexer\$conf
‪array $conf
Definition: Indexer.php:117
‪TYPO3\CMS\Core\Http\RequestFactory
Definition: RequestFactory.php:30
‪TYPO3\CMS\IndexedSearch\Indexer\updateTstamp
‪updateTstamp($phash, $mtime=0)
Definition: Indexer.php:1661
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility\md5inthash
‪static int md5inthash(string $stringToHash)
Definition: IndexedSearchUtility.php:50
‪TYPO3\CMS\IndexedSearch\Indexer\processWordsInArrays
‪array processWordsInArrays($contentArr)
Definition: Indexer.php:984
‪TYPO3\CMS\IndexedSearch\Indexer\embracingTags
‪bool embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
Definition: Indexer.php:453
‪TYPO3\CMS\IndexedSearch\Indexer\extractLinks
‪extractLinks($content)
Definition: Indexer.php:510
‪TYPO3\CMS\IndexedSearch\Indexer\indexTypo3PageContent
‪indexTypo3PageContent()
Definition: Indexer.php:279
‪TYPO3\CMS\Core\Database\Connection
Definition: Connection.php:36
‪TYPO3\CMS\IndexedSearch\Indexer\submitFile_grlist
‪submitFile_grlist($hash)
Definition: Indexer.php:1381
‪TYPO3\CMS\Webhooks\Message\$url
‪identifier readonly UriInterface $url
Definition: LoginErrorOccurredMessage.php:36
‪TYPO3\CMS\IndexedSearch\Indexer\getRootLineFields
‪getRootLineFields(array &$fieldArray)
Definition: Indexer.php:1764
‪TYPO3\CMS\IndexedSearch\Indexer\$tstamp_maxAge
‪int $tstamp_maxAge
Definition: Indexer.php:75
‪TYPO3\CMS\IndexedSearch\Indexer\getUrlHeaders
‪mixed getUrlHeaders($url)
Definition: Indexer.php:648
‪TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController
Definition: TypoScriptFrontendController.php:105
‪$GLOBALS
‪$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['adminpanel']['modules']
Definition: ext_localconf.php:25
‪TYPO3\CMS\Core\Core\Environment
Definition: Environment.php:41
‪TYPO3\CMS\IndexedSearch\Indexer\submitPage
‪submitPage()
Definition: Indexer.php:1138
‪TYPO3\CMS\IndexedSearch\Indexer\$metaphoneContent
‪string $metaphoneContent
Definition: Indexer.php:177
‪TYPO3\CMS\IndexedSearch\Indexer\$maxExternalFiles
‪int $maxExternalFiles
Definition: Indexer.php:88
‪TYPO3\CMS\Core\Utility\MathUtility
Definition: MathUtility.php:24
‪TYPO3\CMS\IndexedSearch\Indexer\getHTMLcharset
‪string getHTMLcharset($content)
Definition: Indexer.php:410
‪TYPO3\CMS\IndexedSearch\Indexer\extractHyperLinks
‪array extractHyperLinks($html)
Definition: Indexer.php:564
‪TYPO3\CMS\IndexedSearch\Indexer\addSpacesToKeywordList
‪string addSpacesToKeywordList($keywordList)
Definition: Indexer.php:1999
‪TYPO3\CMS\IndexedSearch\Indexer\initializeExternalParsers
‪initializeExternalParsers()
Definition: Indexer.php:259
‪TYPO3\CMS\Core\Database\ConnectionPool
Definition: ConnectionPool.php:51
‪TYPO3\CMS\IndexedSearch\Indexer\__construct
‪__construct()
Definition: Indexer.php:201
‪TYPO3\CMS\IndexedSearch\Indexer\submit_section
‪submit_section($hash, $hash_t3)
Definition: Indexer.php:1245
‪TYPO3\CMS\IndexedSearch\Indexer\splitRegularContent
‪array splitRegularContent($content)
Definition: Indexer.php:950
‪TYPO3\CMS\Core\Utility\MathUtility\forceIntegerInRange
‪static int forceIntegerInRange(mixed $theInt, int $min, int $max=2000000000, int $defaultValue=0)
Definition: MathUtility.php:34
‪TYPO3\CMS\IndexedSearch\Indexer
Definition: Indexer.php:39
‪TYPO3\CMS\Core\Utility\GeneralUtility
Definition: GeneralUtility.php:51
‪TYPO3\CMS\IndexedSearch\Indexer\metaphone
‪mixed metaphone($word, $returnRawMetaphoneValue=false)
Definition: Indexer.php:1111
‪TYPO3\CMS\Core\Utility\GeneralUtility\writeFile
‪static bool writeFile($file, $content, $changePermissions=false)
Definition: GeneralUtility.php:1567
‪TYPO3\CMS\IndexedSearch\Indexer\typoSearchTags
‪bool typoSearchTags(&$body)
Definition: Indexer.php:482
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPathFromRelativeURL
‪string createLocalPathFromRelativeURL($sourcePath)
Definition: Indexer.php:758
‪TYPO3\CMS\IndexedSearch\Indexer\checkWordList
‪checkWordList($wordListArray)
Definition: Indexer.php:1784
‪TYPO3\CMS\Core\TimeTracker\TimeTracker
Definition: TimeTracker.php:32
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility
Definition: IndexedSearchUtility.php:28
‪TYPO3\CMS\IndexedSearch\Indexer\analyzeBody
‪analyzeBody(&$retArr, $content)
Definition: Indexer.php:1076
‪TYPO3\CMS\IndexedSearch\Indexer\$freqRange
‪int $freqRange
Definition: Indexer.php:161
‪TYPO3\CMS\IndexedSearch\Indexer\extractBaseHref
‪string extractBaseHref($html)
Definition: Indexer.php:593
‪TYPO3\CMS\IndexedSearch\Indexer\setT3Hashes
‪setT3Hashes()
Definition: Indexer.php:1916
‪TYPO3\CMS\IndexedSearch\Indexer\$defaultGrList
‪string $defaultGrList
Definition: Indexer.php:69
‪TYPO3\CMS\IndexedSearch\Indexer\readFileContent
‪array null readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
Definition: Indexer.php:916
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility\milliseconds
‪static milliseconds()
Definition: IndexedSearchUtility.php:172
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPath
‪string createLocalPath($sourcePath)
Definition: Indexer.php:670