‪TYPO3CMS  ‪main
Indexer.php
Go to the documentation of this file.
1 <?php
2 
3 /*
4  * This file is part of the TYPO3 CMS project.
5  *
6  * It is free software; you can redistribute it and/or modify it under
7  * the terms of the GNU General Public License, either version 2
8  * of the License, or any later version.
9  *
10  * For the full copyright and license information, please read the
11  * LICENSE.txt file that was distributed with this source code.
12  *
13  * The TYPO3 project - inspiring people to share!
14  */
15 
17 
18 use Psr\Log\LogLevel;
32 
39 {
/**
 * Log texts keyed by the status code of the mtime/timestamp check.
 * Looked up as $this->reasons[$check] when logging why a document was
 * (positive keys) or was not (negative keys) indexed.
 *
 * @var array<int, string>
 */
public $reasons = [
    -1 => 'mtime matched the document, so no changes detected and no content updated',
    -2 => 'The minimum age was not exceeded',
    1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
    2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
    3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
    4 => 'Page has never been indexed (is not represented in the index_phash table).',
];

/**
 * Comma-separated tag names whose complete sections are removed from the
 * body in splitHTMLContent().
 *
 * @var string
 */
public $excludeSections = 'script,style';

/**
 * External document parser objects keyed by file extension; filled by
 * initializeExternalParsers().
 *
 * @var array
 */
public $external_parsers = [];

/**
 * Default group list.
 * NOTE(review): not read in the visible part of this file; presumably the
 * gr_list of a visitor without login - confirm against the callers.
 *
 * @var string
 */
public $defaultGrList = '0,-1';

/**
 * Max-age setting; the constructor stores the "maxAge" extension setting
 * multiplied by 3600 here.
 *
 * @var int
 */
public $tstamp_maxAge = 0;

/**
 * Min-age setting; the constructor stores the "minAge" extension setting
 * multiplied by 3600 here.
 *
 * @var int
 */
public $tstamp_minAge = 0;

/**
 * Upper limit of external files indexed per run unless indexing is forced
 * (compared against $externalFileCounter in indexRegularDocument()).
 *
 * @var int
 */
public $maxExternalFiles = 0;

/**
 * When true, indexTypo3PageContent() indexes regardless of the mtime check.
 *
 * @var bool
 */
public $forceIndexing = false;

/**
 * Template for the content array: one empty string per content section.
 * The declaration line was missing in the listing (only the array body was
 * left behind); restored because splitHTMLContent() and
 * splitRegularContent() read this property.
 *
 * @var array<string, string>
 */
public $defaultContentArray = [
    'title' => '',
    'description' => '',
    'keywords' => '',
    'body' => '',
];

/**
 * Running number of words seen by analyzeHeaderinfo() / analyzeBody().
 *
 * @var int
 */
public $wordcount = 0;

/**
 * Number of external files indexed so far.
 *
 * @var int
 */
public $externalFileCounter = 0;

/**
 * Indexing configuration of the current document, set through init().
 *
 * @var array
 */
public $conf = [];

/**
 * Settings of the "indexed_search" extension configuration.
 *
 * @var array
 */
public $indexerConfig = [];

/**
 * Hashes of the current page; the code reads the keys "phash" and
 * "phash_grouping".
 *
 * @var array
 */
public $hash = [];

/**
 * Hashes of the external file part that is currently being indexed.
 *
 * @var array
 */
public $file_phash_arr = [];

/**
 * Content sections (title, description, keywords, body) of the current page.
 *
 * @var array
 */
public $contentParts = [];

/**
 * Hash over the concatenated content sections of the current page.
 *
 * @var int|string|null
 */
public $content_md5h;

/**
 * Log entries; reset for every file part in indexRegularDocument().
 *
 * @var array
 */
public $internal_log = [];

/**
 * Content of the most recently fetched external URL.
 *
 * @var string
 */
public $indexExternalUrl_content = '';

/**
 * NOTE(review): not read in the visible part of this file; presumably the
 * value range used for word frequencies - confirm.
 *
 * @var int
 */
public $freqRange = 32000;

/**
 * NOTE(review): not read in the visible part of this file; presumably the
 * maximum relative word frequency - confirm.
 *
 * @var float
 */
public $freqMax = 0.1;

/**
 * Whether metaphone values are calculated for indexed words.
 *
 * @var bool
 */
public $enableMetaphoneSearch = false;

/**
 * Whether raw metaphone strings (instead of hashes) are produced and
 * collected in $metaphoneContent; set in the constructor.
 * The declaration was missing in the listing and is restored here.
 * NOTE(review): visibility could not be read from the listing - confirm.
 *
 * @var bool
 */
protected $storeMetaphoneInfoAsWords;

/**
 * Space-separated metaphone values collected during analysis.
 *
 * @var string
 */
public $metaphoneContent = '';

/**
 * Optional metaphone hook object created in init().
 * The declaration was missing in the listing and is restored here.
 *
 * @var object|null
 */
public $metaphoneObj;

/**
 * Lexer object that splits text into words; created in init().
 *
 * @var object
 */
public $lexerObj;

/**
 * "flagBitMask" extension setting, limited to 0..255 by the constructor.
 *
 * @var int
 */
public $flagBitMask;

/**
 * @var TimeTracker
 */
protected $timeTracker;
224 
/**
 * Creates the time tracker and copies the settings of the "indexed_search"
 * extension configuration into the indexer properties.
 */
public function __construct()
{
    $this->timeTracker = GeneralUtility::makeInstance(TimeTracker::class);

    // Indexer configuration from Extension Manager interface
    $settings = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');
    $this->indexerConfig = $settings;

    // minAge / maxAge are multiplied by 3600 before they are stored
    $minAge = (int)($settings['minAge'] ?? 0) * 3600;
    $maxAge = (int)($settings['maxAge'] ?? 0) * 3600;
    $this->tstamp_minAge = MathUtility::forceIntegerInRange($minAge, 0);
    $this->tstamp_maxAge = MathUtility::forceIntegerInRange($maxAge, 0);

    $externalFileLimit = (int)($settings['maxExternalFiles'] ?? 5);
    $this->maxExternalFiles = MathUtility::forceIntegerInRange($externalFileLimit, 0, 1000);

    $bitMask = (int)($settings['flagBitMask'] ?? 0);
    $this->flagBitMask = MathUtility::forceIntegerInRange($bitMask, 0, 255);

    // Workaround: If the extension configuration was not updated yet, the value is not existing
    // and metaphone search counts as enabled.
    $this->enableMetaphoneSearch = (bool)($settings['enableMetaphoneSearch'] ?? true);

    $wordTableUsed = IndexedSearchUtility::isTableUsed('index_words');
    $this->storeMetaphoneInfoAsWords = !$wordTableUsed && $this->enableMetaphoneSearch;
}
241 
242  /********************************
243  *
244  * Initialization
245  *
246  *******************************/
247 
/**
 * Initializes the indexer for one document: stores the configuration,
 * calculates the page hashes and creates the parser, lexer and metaphone
 * helper objects.
 *
 * @param array|null $configuration Indexing configuration; when null the current $this->conf is kept
 */
public function init(?array $configuration = null)
{
    if (is_array($configuration)) {
        $this->conf = $configuration;
    }
    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();
    // Initialize external document parsers:
    // Example configuration, see ext_localconf.php of this file!
    if (!empty($this->conf['index_externals'])) {
        // The listing contained an empty block here, so parsers were never
        // created even though external files were requested.
        $this->initializeExternalParsers();
    }
    // Initialize lexer (class that deconstructs the text into words):
    $lexerObjectClassName = ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'] ?? false) ?: Lexer::class;
    $lexer = GeneralUtility::makeInstance($lexerObjectClassName);
    $this->lexerObj = $lexer;
    $this->lexerObj->debug = $this->indexerConfig['debugMode'] ?? null;
    // Initialize metaphone hook:
    // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
    $metaphoneClassName = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone'] ?? false;
    if ($this->enableMetaphoneSearch && $metaphoneClassName) {
        $metaphoneObj = GeneralUtility::makeInstance($metaphoneClassName);
        $this->metaphoneObj = $metaphoneObj;
        $this->metaphoneObj->pObj = $this;
    }
}
279 
/**
 * Creates one parser object per file extension registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers']
 * and keeps those whose initParser() call succeeds.
 */
public function initializeExternalParsers()
{
    $registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] ?? [];
    foreach ($registeredParsers as $extension => $className) {
        $parser = GeneralUtility::makeInstance($className);
        // Register first, so the entry is already visible while the parser initializes
        $this->external_parsers[$extension] = $parser;
        $parser->pObj = $this;
        if ($parser->initParser($extension)) {
            continue;
        }
        // Init parser returned FALSE, so the entry is removed again
        unset($this->external_parsers[$extension]);
    }
}
297 
298  /********************************
299  *
300  * Indexing; TYPO3 pages (HTML content)
301  *
302  *******************************/
/**
 * Indexes the TYPO3 page described by $this->conf.
 *
 * Nothing is indexed when the mtime check reports no reason, the group list
 * is already registered and indexing is not forced. If a page with the same
 * content hash is already indexed (and max-age was not exceeded), only
 * timestamp, set id, group list and rootline are updated. Otherwise the
 * content is split, analyzed and written to the index, and linked external
 * files are processed when configured.
 */
public function indexTypo3PageContent()
{
    $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
    $is_grlist = $this->is_grlist_set($this->hash['phash']);
    $indexingNeeded = $check > 0 || !$is_grlist || $this->forceIndexing;
    if (!$indexingNeeded) {
        $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        return;
    }

    // Setting message:
    if ($this->forceIndexing) {
        $this->log_setTSlogMessage('Indexing needed, reason: Forced', LogLevel::NOTICE);
    } elseif ($check > 0) {
        $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], LogLevel::NOTICE);
    } else {
        $this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!', LogLevel::NOTICE);
    }

    // Divide into title,keywords,description and body:
    $this->log_push('Split content', '');
    $this->contentParts = $this->splitHTMLContent($this->conf['content']);
    if (!empty($this->conf['indexedDocTitle'])) {
        $this->contentParts['title'] = $this->conf['indexedDocTitle'];
    }
    $this->log_pull();

    // Hash over the complete content, deliberately including title, description and keywords,
    // so that a changed page title alone also triggers re-indexing.
    $this->content_md5h = IndexedSearchUtility::md5inthash(implode('', $this->contentParts));

    // Checks if there is already a page (with gr_list = 0,-1) indexed that has the very same contentHash.
    // If so, the page is already indexed and regardless of mtime nothing more than bookkeeping is needed.
    // This also prevents re-indexing for logged-in fe_users when the content is identical anyway.
    $checkCHash = $this->checkContentHash();
    if (is_array($checkCHash) && $check !== 1) {
        // Update the timestamp
        $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
        $this->updateSetId($this->hash['phash']);
        // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
        $this->update_grlist($checkCHash['phash'], $this->hash['phash']);
        $this->updateRootline();
        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
        return;
    }

    // Start of the measured parse time. This assignment was missing in the
    // listing although $Pstart is subtracted from the end time below.
    $Pstart = IndexedSearchUtility::milliseconds();

    $this->log_push('Converting entities of content', '');
    $this->charsetEntity2utf8($this->contentParts);
    $this->log_pull();

    // Splitting words
    $this->log_push('Extract words from content', '');
    $splitInWords = $this->processWordsInArrays($this->contentParts);
    $this->log_pull();

    // Analyze the indexed words.
    $this->log_push('Analyze the extracted words', '');
    $indexArr = $this->indexAnalyze($splitInWords);
    $this->log_pull();

    // Submitting page (phash) record
    $this->log_push('Submitting page', '');
    $this->submitPage();
    $this->log_pull();

    // Check words and submit to word list if not there
    $this->log_push('Check word list and submit words', '');
    if (IndexedSearchUtility::isTableUsed('index_words')) {
        $indexArr = $this->removePhashCollisions($indexArr);
        $this->checkWordList($indexArr);
        $this->submitWords($indexArr, $this->hash['phash']);
    }
    $this->log_pull();

    // Set parsetime
    $this->updateParsetime($this->hash['phash'], IndexedSearchUtility::milliseconds() - $Pstart);

    // Checking external files if configured for.
    $this->log_push('Checking external files', '');
    if (!empty($this->conf['index_externals'])) {
        $this->extractLinks($this->conf['content']);
    }
    $this->log_pull();
}
378 
/**
 * Splits an HTML document into the sections title, keywords, description
 * and body.
 *
 * The body is everything from the first "<body" on, reduced by the
 * TYPO3SEARCH markers, stripped of the sections listed in
 * $this->excludeSections and of all tags. Title and meta tags are read from
 * the part before the body.
 *
 * @param string $content HTML document
 * @return array Content array with the keys of $this->defaultContentArray
 */
public function splitHTMLContent($content)
{
    // divide head from body
    $contentArr = $this->defaultContentArray;
    $contentArr['body'] = stristr($content, '<body') ?: '';
    $headPart = substr($content, 0, -strlen($contentArr['body']));
    // get title; if it contains a colon, only the part after the first colon is used
    $this->embracingTags($headPart, 'TITLE', $contentArr['title'], $dummy2, $dummy);
    $titleParts = explode(':', $contentArr['title'], 2);
    $contentArr['title'] = trim($titleParts[1] ?? $titleParts[0]);
    // get keywords and description metatags
    if (!empty($this->conf['index_metatags'])) {
        $meta = [];
        $i = 0;
        // Each call cuts the first meta tag out of $headPart and stores its attribute string in $meta[$i]
        while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $meta[$i])) {
            $i++;
        }
        // @todo The code below stops at first unset tag. Is that correct?
        for ($i = 0; isset($meta[$i]); $i++) {
            // decode HTML entities, meta tag content needs to be encoded later
            $attributes = GeneralUtility::get_tag_attributes($meta[$i], true);
            $meta[$i] = $attributes;
            $metaName = $attributes['name'] ?? '';
            // A meta tag may have a name but no content attribute
            $metaContent = $attributes['content'] ?? '';
            if (stripos($metaName, 'keywords') !== false) {
                $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaContent);
            }
            if (stripos($metaName, 'description') !== false) {
                $contentArr['description'] .= ',' . $metaContent;
            }
        }
    }
    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($contentArr['body']);
    // Get rid of unwanted sections (ie. scripting and style stuff) in body
    $tagList = explode(',', $this->excludeSections);
    foreach ($tagList as $tag) {
        while ($this->embracingTags($contentArr['body'], $tag, $dummy, $contentArr['body'], $dummy2)) {
        }
    }
    // remove tags, but first make sure we don't concatenate words by doing it
    $contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
    $contentArr['body'] = trim(strip_tags($contentArr['body']));
    $contentArr['keywords'] = trim($contentArr['keywords']);
    $contentArr['description'] = trim($contentArr['description']);
    // Return array
    return $contentArr;
}
431 
/**
 * Reads the charset from a <meta http-equiv="Content-Type" ...> tag.
 *
 * @param string $content HTML document
 * @return string Charset as written in the document, or an empty string if none was found
 */
public function getHTMLcharset($content)
{
    $metaTagPattern = '/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i';
    if (!preg_match($metaTagPattern, $content, $metaTag)) {
        return '';
    }

    $charsetPattern = '/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i';
    if (!preg_match($charsetPattern, $metaTag[0], $charsetMatch)) {
        return '';
    }

    return $charsetMatch[1];
}
448 
/**
 * Converts an HTML document to UTF-8 and decodes its HTML entities.
 *
 * @param string $content HTML document
 * @param string $charset Source charset; detected with getHTMLcharset() when empty
 * @return string Converted document
 */
public function convertHTMLToUtf8($content, $charset = '')
{
    // Use the given charset, otherwise look it up in the document
    $sourceCharset = trim(strtolower($charset ?: $this->getHTMLcharset($content)));

    $needsConversion = $sourceCharset && $sourceCharset !== 'utf-8';
    if ($needsConversion) {
        $content = mb_convert_encoding($content, 'utf-8', $sourceCharset);
    }

    // Convert entities, assuming document is now UTF-8:
    return html_entity_decode($content);
}
468 
/**
 * Finds the first occurrence of <$tagName ...> in $string
 * (case-insensitive) and reports its pieces through the reference
 * parameters.
 *
 * If a matching end tag follows, $tagContent receives the text between the
 * tags and $stringAfter receives $string with the whole element removed.
 * Without an end tag, $tagContent is empty and $stringAfter is everything
 * after the ">" of the start tag. A start tag that is never closed with ">"
 * yields empty $tagContent and $stringAfter.
 *
 * Note: the start tag is matched by prefix, so "<meta" also matches a
 * longer tag name beginning with "meta".
 *
 * @param string $string Text to search
 * @param string $tagName Tag name without angle brackets
 * @param string $tagContent Receives the content of the element
 * @param string $stringAfter Receives the remaining text
 * @param string $paramList Receives the raw attribute string of the start tag
 * @return bool False if the tag was not found (reference parameters are left untouched), otherwise true
 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
{
    $endTag = '</' . $tagName . '>';
    $startTag = '<' . $tagName;
    // stristr used because we want a case-insensitive search for the tag.
    $fromStartTag = stristr($string, $startTag);
    // if the tag was not found, return FALSE
    if (!$fromStartTag) {
        return false;
    }
    // Split into attribute string and the text following the start tag.
    // array_pad() covers markup that ends before the start tag is closed.
    $pieces = explode('>', substr($fromStartTag, strlen($startTag)), 2);
    [$paramList, $afterStartTag] = array_pad($pieces, 2, '');
    $fromEndTag = stristr($afterStartTag, $endTag);
    if ($fromEndTag) {
        $stringBefore = substr($string, 0, (int)stripos($string, $startTag));
        $tagContent = substr($afterStartTag, 0, strlen($afterStartTag) - strlen($fromEndTag));
        $stringAfter = $stringBefore . substr($fromEndTag, strlen($endTag));
    } else {
        $tagContent = '';
        $stringAfter = $afterStartTag;
    }
    return true;
}
503 
/**
 * Reduces $body to the parts selected by the <!--TYPO3SEARCH_begin--> and
 * <!--TYPO3SEARCH_end--> markers.
 *
 * Text after a "begin" marker is kept. An "end" marker keeps the text of
 * the last preceding segment that was neither "begin" nor "end" (in
 * practice the text before the first marker); the text after "end" is
 * dropped.
 *
 * @param string $body Body text, modified in place when markers are present
 * @return bool True if at least one marker was found, otherwise false ($body untouched)
 */
public function typoSearchTags(&$body)
{
    $segments = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body) ?: [];
    if (count($segments) <= 1) {
        return false;
    }

    $kept = '';
    $pending = '';
    foreach ($segments as $segment) {
        $pieces = explode('-->', $segment, 2);
        $marker = trim($pieces[0]);
        if ($marker === 'begin') {
            $kept .= $pieces[1];
            $pending = '';
            continue;
        }
        if ($marker === 'end') {
            $kept .= $pending;
            continue;
        }
        // No marker keyword: remember the segment in case an "end" marker follows
        $pending = $segment;
    }
    $body = $kept;

    return true;
}
532 
/**
 * Finds the hyperlinks in $content and indexes their targets: external
 * URLs (when enabled in the extension configuration) and local files.
 *
 * @param string $content HTML content to search for links
 */
public function extractLinks($content)
{
    foreach ($this->extractHyperLinks($content) as $linkInfo) {
        // localPath means: This file is sent by a download script. While the indexed URL has to
        // point to $linkInfo['href'], the absolute path to the file is specified here!
        $linkSource = htmlspecialchars_decode($linkInfo['localPath'] ?: $linkInfo['href']);
        // Parse URL:
        $qParts = parse_url($linkSource);
        // Check for jumpurl (TYPO3 specific thing...)
        if (($qParts['query'] ?? false) && str_contains($qParts['query'], 'jumpurl=')) {
            parse_str($qParts['query'], $getP);
            // "jumpurl=" may only be the tail of another parameter name, or an array parameter
            $jumpUrl = $getP['jumpurl'] ?? null;
            if (is_string($jumpUrl)) {
                $linkSource = $jumpUrl;
                $qParts = parse_url($linkSource);
            }
        }
        if (!$linkInfo['localPath'] && ($qParts['scheme'] ?? false)) {
            if (!empty($this->indexerConfig['indexExternalURLs'])) {
                // Index external URL (http or otherwise)
                $this->indexExternalUrl($linkSource);
            }
            continue;
        }
        if ($qParts['query'] ?? false) {
            // Links with a query string are not treated as files
            continue;
        }
        $linkSource = urldecode($linkSource);
        if (GeneralUtility::isAllowedAbsPath($linkSource)) {
            $localFile = $linkSource;
        } else {
            $localFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $linkSource);
        }
        if (!$localFile || !@is_file($localFile)) {
            continue;
        }
        // Index local file:
        if ($linkInfo['localPath']) {
            // PATHINFO_EXTENSION yields an empty string for files without extension
            $ext = strtolower(pathinfo($linkSource, PATHINFO_EXTENSION));
            $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
        } else {
            $this->indexRegularDocument($linkSource);
        }
    }
}
584 
/**
 * Collects all <a> tags of an HTML document that carry a non-empty href
 * which does not start with "#".
 *
 * @param string $html HTML content
 * @return array List of arrays with the keys "tag", "href" and "localPath"
 */
public function extractHyperLinks($html)
{
    $parser = GeneralUtility::makeInstance(HtmlParser::class);
    $links = [];
    foreach ($parser->splitTags('a', $html) as $position => $fragment) {
        // Only the odd positions of the split result are processed
        if ($position % 2 === 0) {
            continue;
        }
        $attributes = $parser->get_tag_attributes($fragment, true);
        $tagName = $parser->getFirstTagName($fragment);
        if (strtolower($tagName) !== 'a') {
            continue;
        }
        if (empty($attributes[0]['href'])) {
            continue;
        }
        $href = $attributes[0]['href'];
        if (substr($href, 0, 1) === '#') {
            continue;
        }
        $links[] = [
            'tag' => $fragment,
            'href' => $href,
            'localPath' => $this->createLocalPath(urldecode($href)),
        ];
    }
    return $links;
}
614 
/**
 * Returns the href of the first <base> tag that has a non-empty href.
 *
 * @param string $html HTML content
 * @return string The href value; an empty string if no <base> tag carries a usable href
 */
public function extractBaseHref($html)
{
    $href = '';
    $htmlParser = GeneralUtility::makeInstance(HtmlParser::class);
    $htmlParts = $htmlParser->splitTags('base', $html);
    foreach ($htmlParts as $index => $tagData) {
        // Only the odd positions of the split result are processed
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'base') {
            continue;
        }
        // A <base> tag may carry only a target attribute; keep the return value a string
        $href = $tagAttributes[0]['href'] ?? '';
        if ($href) {
            break;
        }
    }
    return $href;
}
640 
641  /******************************************
642  *
643  * Indexing; external URL
644  *
645  ******************************************/
/**
 * Indexes an external URL if its HEAD response announces an HTML document.
 *
 * The document is downloaded to a temporary file, indexed with forced
 * indexing (a file mtime is meaningless for a fresh download) and the
 * temporary file is removed afterwards, also when indexing throws.
 *
 * @param string $externalUrl URL to index
 */
public function indexExternalUrl($externalUrl)
{
    // Get headers:
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    if (!is_array($urlHeaders)) {
        return;
    }
    // HTTP header names are case-insensitive, servers may send "content-type"
    $contentType = (string)(array_change_key_case($urlHeaders, CASE_LOWER)['content-type'] ?? '');
    if (stripos($contentType, 'text/html') === false) {
        return;
    }
    $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
    if ((string)$content === '') {
        return;
    }
    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    try {
        GeneralUtility::writeFile($tmpFile, $content);
        // Index that file, using "TRUE" for second parameter to force indexing of external URLs
        $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
    } finally {
        if (is_file($tmpFile)) {
            unlink($tmpFile);
        }
    }
}
669 
/**
 * Sends a HEAD request to the URL and returns the response headers.
 *
 * @param string $url URL to request
 * @return array|false Header name => header values joined to one string; false if the request threw an exception
 */
public function getUrlHeaders($url)
{
    try {
        $flattened = [];
        $headers = GeneralUtility::makeInstance(RequestFactory::class)
            ->request($url, 'HEAD')
            ->getHeaders();
        foreach ($headers as $name => $values) {
            $flattened[$name] = implode('', $values);
        }
    } catch (\Exception $e) {
        // fail silently if the HTTP request failed
        return false;
    }

    return $flattened;
}
691 
/**
 * Tries to map a link target to a local file path by asking the
 * createLocalPath* strategies one after the other; the first non-empty
 * answer is returned.
 *
 * @param string $sourcePath Link target
 * @return string Local path, or the (empty) result of the last strategy
 */
protected function createLocalPath($sourcePath)
{
    $strategies = [
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
        'createLocalPathFromRelativeURL',
    ];
    foreach ($strategies as $strategy) {
        $localPath = $this->{$strategy}($sourcePath);
        if ($localPath != '') {
            return $localPath;
        }
    }
    return $localPath;
}
714 
/**
 * Maps a link that starts with the site URL of the current request to a
 * file below the public path.
 *
 * @param string $sourcePath Link target
 * @return string Local path of an allowed, existing file; otherwise an empty string
 */
protected function createLocalPathUsingDomainURL($sourcePath)
{
    // Without a current request (or without normalized parameters) there is no
    // site URL to compare with; same defensive style as the TSFE check in
    // createLocalPathUsingAbsRefPrefix().
    $baseURL = ($GLOBALS['TYPO3_REQUEST'] ?? null)?->getAttribute('normalizedParams')?->getSiteUrl();
    if ($baseURL === null) {
        return '';
    }
    if (!str_starts_with($sourcePath, $baseURL)) {
        return '';
    }
    $relativePath = substr($sourcePath, strlen($baseURL));
    $localPath = Environment::getPublicPath() . '/' . $relativePath;

    return self::isAllowedLocalFile($localPath) ? $localPath : '';
}
735 
/**
 * Maps a link that starts with the configured config.absRefPrefix to a
 * file below the public path.
 *
 * @param string $sourcePath Link target
 * @return string Local path of an allowed, existing file; otherwise an empty string
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath)
{
    if (!isset($GLOBALS['TSFE']) || !$GLOBALS['TSFE'] instanceof TypoScriptFrontendController) {
        return '';
    }
    // absRefPrefix is optional; treat a missing setting as "no prefix"
    $absRefPrefix = (string)($GLOBALS['TSFE']->config['config']['absRefPrefix'] ?? '');
    if ($absRefPrefix === '' || !str_starts_with($sourcePath, $absRefPrefix)) {
        return '';
    }
    $relativePath = substr($sourcePath, strlen($absRefPrefix));
    $localPath = Environment::getPublicPath() . '/' . $relativePath;

    return self::isAllowedLocalFile($localPath) ? $localPath : '';
}
759 
/**
 * Maps a link that starts with "/" to a file below the public path.
 *
 * @param string $sourcePath Link target
 * @return string Local path of an allowed, existing file; otherwise an empty string
 */
protected function createLocalPathFromAbsoluteURL($sourcePath)
{
    $firstCharacter = substr(($sourcePath[0] ?? ''), 0, 1);
    if ($firstCharacter !== '/') {
        return '';
    }

    $candidate = Environment::getPublicPath() . '/' . substr($sourcePath, 1);
    if (self::isAllowedLocalFile($candidate)) {
        return $candidate;
    }

    return '';
}
779 
/**
 * Maps a relative link (see isRelativeURL()) to a file below the public
 * path.
 *
 * @param string $sourcePath Link target
 * @return string Local path of an allowed, existing file; otherwise an empty string
 */
protected function createLocalPathFromRelativeURL($sourcePath)
{
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }

    $candidate = Environment::getPublicPath() . '/' . $sourcePath;
    if (self::isAllowedLocalFile($candidate)) {
        return $candidate;
    }

    return '';
}
797 
/**
 * Tells whether a URL is relative: it has no scheme and its path does not
 * start with "/".
 *
 * @param string $url URL to check
 * @return bool
 */
protected static function isRelativeURL($url)
{
    $parts = @parse_url($url);

    $hasScheme = isset($parts['scheme']) && $parts['scheme'] !== '';
    if ($hasScheme) {
        return false;
    }

    $firstPathCharacter = substr(($parts['path'][0] ?? ''), 0, 1);

    return $firstPathCharacter !== '/';
}
809 
/**
 * Tells whether a path points to an existing file inside the public path.
 *
 * The comparison uses the public path with a trailing slash. A plain prefix
 * check would also accept sibling directories that merely share the prefix
 * (public path "/var/www/html" vs. "/var/www/html-backup/...").
 *
 * @param string $filePath Path to check; "../" segments are resolved first
 * @return bool
 */
protected static function isAllowedLocalFile($filePath)
{
    $filePath = GeneralUtility::resolveBackPath($filePath);
    $publicPath = rtrim(Environment::getPublicPath(), '/') . '/';
    $insideWebPath = str_starts_with($filePath, $publicPath);
    $isFile = is_file($filePath);

    return $insideWebPath && $isFile;
}
823 
824  /******************************************
825  *
826  * Indexing; external files (PDF, DOC, etc)
827  *
828  ******************************************/
/**
 * Indexes a regular document (a file handled by one of the external
 * parsers), one index entry per content part reported by the parser.
 *
 * @param string $file File reference; relative paths are resolved against the public path
 * @param bool $force Index even if the mtime check sees no reason and the external file limit is reached
 * @param string $contentTmpFile File to read the content from instead of $file (used as-is)
 * @param string $altExtension File extension that selects the parser; taken from $file when empty
 */
public function indexRegularDocument($file, $force = false, $contentTmpFile = '', $altExtension = '')
{
    // PATHINFO_EXTENSION yields an empty string for files without extension
    $ext = $altExtension ?: strtolower(pathinfo($file, PATHINFO_EXTENSION));
    // Create abs-path:
    if (!$contentTmpFile) {
        if (!PathUtility::isAbsolutePath($file)) {
            // Relative, prepend public web path:
            $absFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $file);
        } else {
            // Absolute, pass-through:
            $absFile = $file;
        }
        $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
    } else {
        $absFile = $contentTmpFile;
    }
    if (!$absFile || !@is_file($absFile)) {
        $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
        return;
    }
    if (!($this->external_parsers[$ext] ?? false)) {
        $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
        return;
    }

    // Indexing the document:
    $fileInfo = stat($absFile);
    $cParts = $this->fileContentParts($ext, $absFile);
    foreach ($cParts as $cPKey) {
        $this->internal_log = [];
        $this->log_push('Index: ' . str_replace('.', '_', PathUtility::basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
        // Start of the measured parse time. This assignment was missing in the
        // listing although $Pstart is subtracted from the end time below.
        $Pstart = IndexedSearchUtility::milliseconds();
        $subinfo = ['key' => $cPKey];
        // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
        $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
        $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
        if ($check > 0 || $force) {
            if ($check > 0) {
                $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], LogLevel::NOTICE);
            } else {
                $this->log_setTSlogMessage('Indexing forced by flag', LogLevel::NOTICE);
            }
            // Check external file counter:
            if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                // Divide into title,keywords,description and body:
                $this->log_push('Split content', '');
                $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                $this->log_pull();
                if (is_array($contentParts)) {
                    // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                    // This assignment was missing in the listing although the hash is used below.
                    $content_md5h = IndexedSearchUtility::md5inthash(implode('', $contentParts));
                    if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                        // Increment counter:
                        $this->externalFileCounter++;
                        // Splitting words
                        $this->log_push('Extract words from content', '');
                        $splitInWords = $this->processWordsInArrays($contentParts);
                        $this->log_pull();
                        // Analyze the indexed words.
                        $this->log_push('Analyze the extracted words', '');
                        $indexArr = $this->indexAnalyze($splitInWords);
                        $this->log_pull();
                        // Submitting page (phash) record
                        $this->log_push('Submitting page', '');
                        // The creation time of a file cannot be determined here, so mtime and ctime from stat() are passed on
                        $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
                        $this->log_pull();
                        // Check words and submit to word list if not there
                        $this->log_push('Check word list and submit words', '');
                        if (IndexedSearchUtility::isTableUsed('index_words')) {
                            $indexArr = $this->removePhashCollisions($indexArr);
                            $this->checkWordList($indexArr);
                            $this->submitWords($indexArr, $phash_arr['phash']);
                        }
                        $this->log_pull();
                        // Set parsetime
                        $this->updateParsetime($phash_arr['phash'], IndexedSearchUtility::milliseconds() - $Pstart);
                    } else {
                        // Update the timestamp
                        $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
                        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                    }
                } else {
                    $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                }
            } else {
                $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
            }
        } else {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        }
        // Checking and setting sections:
        // Setting a section-record for the file. This is done also if the file is not indexed.
        // Notice that section records are deleted when the page is indexed.
        $this->submitFile_section($phash_arr['phash']);
        $this->log_pull();
    }
}
935 
/**
 * Reads the content of a file (part) through the external parser
 * registered for the file extension.
 *
 * @param string $fileExtension File extension that selects the parser
 * @param string $absoluteFileName Absolute path of the file
 * @param string|int $sectionPointer Content part to read, as returned by fileContentParts()
 * @return mixed Result of the parser's readFileContent(); null if no parser object is registered for the extension
 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
{
    // Consult relevant external document parser; an unregistered extension is not an error
    $parser = $this->external_parsers[$fileExtension] ?? null;
    if (!is_object($parser)) {
        return null;
    }

    return $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
954 
/**
 * Asks the external parser registered for the file extension into which
 * content parts the file is divided.
 *
 * @param string $ext File extension that selects the parser
 * @param string $absFile Absolute path of the file
 * @return array Content part keys from the parser; [0] if no parser object is registered for the extension
 */
public function fileContentParts($ext, $absFile)
{
    // Consult relevant external document parser; an unregistered extension is not an error
    $parser = $this->external_parsers[$ext] ?? null;
    if (!is_object($parser)) {
        return [0];
    }

    return $parser->fileContentParts($ext, $absFile);
}
971 
/**
 * Wraps plain text into a content array: a copy of
 * $this->defaultContentArray with the text as "body".
 *
 * @param string $content Plain text
 * @return array Content array
 */
public function splitRegularContent($content)
{
    $sections = $this->defaultContentArray;
    $sections['body'] = $content;

    return $sections;
}
985 
986  /**********************************
987  *
988  * Analysing content, Extracting words
989  *
990  **********************************/
/**
 * Decodes numeric and named HTML entities in every non-empty element of the
 * content array.
 *
 * @param array $contentArr Content array, modified in place
 */
public function charsetEntity2utf8(&$contentArr)
{
    foreach ($contentArr as &$section) {
        if ((string)$section === '') {
            continue;
        }
        // decode all numeric / html-entities in the string to real characters:
        $section = html_entity_decode($section);
    }
    unset($section);
}
/**
 * Splits every section of the content array into words with the lexer and
 * removes duplicate words from title, keywords and description.
 *
 * @param array $contentArr Content array with text sections
 * @return array Content array with a word list per section
 */
public function processWordsInArrays($contentArr)
{
    // split all parts to words
    foreach ($contentArr as &$section) {
        $section = $this->lexerObj->split2Words($section);
    }
    unset($section);

    // For title, keywords, and description we don't want duplicates:
    foreach (['title', 'keywords', 'description'] as $headerSection) {
        $contentArr[$headerSection] = array_unique($contentArr[$headerSection]);
    }

    return $contentArr;
}
/**
 * Builds the description of a document from the beginning of its body.
 *
 * @param array $contentArr Content array; the "body" element is used
 * @return string Body with whitespace runs collapsed to single spaces, cut to
 *                the configured "index_descrLgd" length; empty if that length is 0
 */
public function bodyDescription($contentArr)
{
    $maxLength = MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
    if (!$maxLength) {
        return '';
    }

    $singleSpaced = preg_replace('/\s+/u', ' ', $contentArr['body']);

    // Shorten the string. If the database has the wrong character set,
    // the string is probably truncated again.
    return \mb_strcut($singleSpaced, 0, $maxLength, 'utf-8');
}
1046 
/**
 * Builds the word index of a document from its word lists.
 *
 * Title, keywords and description words are registered with the bit
 * offsets 7, 6 and 5; body words are added afterwards.
 *
 * @param array $content Content array with a word list per section
 * @return array Word index, keyed by word
 */
public function indexAnalyze($content)
{
    $indexArr = [];

    $headerSections = ['title' => 7, 'keywords' => 6, 'description' => 5];
    foreach ($headerSections as $section => $bitOffset) {
        $this->analyzeHeaderinfo($indexArr, $content, $section, $bitOffset);
    }
    $this->analyzeBody($indexArr, $content);

    return $indexArr;
}
1062 
/**
 * Adds the words of one header section (title, keywords, description) to
 * the word index.
 *
 * @param array $retArr Word index, modified in place
 * @param array $content Content array with a word list per section
 * @param string $key Section to process
 * @param int $offset Bit position that marks the section in the "cmp" flags of a word
 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset)
{
    foreach ($content[$key] as $rawWord) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $word = mb_substr($rawWord, 0, 60);

        if (!isset($retArr[$word])) {
            // Word ID (wid)
            $retArr[$word]['hash'] = IndexedSearchUtility::md5inthash($word);
            // Metaphone value is also 60 only chars long
            $retArr[$word]['metaphone'] = $this->enableMetaphoneSearch
                ? substr($this->metaphone($word, $this->storeMetaphoneInfoAsWords), 0, 60)
                : '';
        }

        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$word]['metaphone'];
        }

        // Priority used for flagBitMask feature (see extension configuration)
        $sectionFlag = 2 ** $offset;
        $retArr[$word]['cmp'] = ($retArr[$word]['cmp'] ?? 0) | $sectionFlag;

        $hasCount = $retArr[$word]['count'] ?? false;
        if (!$hasCount) {
            $retArr[$word]['count'] = 0;
        }

        // Increase number of occurrences
        $retArr[$word]['count']++;
        $this->wordcount++;
    }
}
1098 
/**
 * Adds the body words to the word index and records the position of the
 * first occurrence of every new word.
 *
 * @param array $retArr Word index, modified in place
 * @param array $content Content array; the "body" word list is used
 */
public function analyzeBody(&$retArr, $content)
{
    foreach ($content['body'] as $key => $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        // Character-wise like in analyzeHeaderinfo(): a byte-wise cut could split a multibyte
        // character and would index the same long word differently for header and body.
        $val = mb_substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // First occurrence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        if (!($retArr[$val]['count'] ?? false)) {
            $retArr[$val]['count'] = 0;
        }

        // Increase number of occurrences
        $retArr[$val]['count']++;
        $this->wordcount++;
    }
}
1132 
/**
 * Calculates the metaphone value of a word, with the metaphone hook object
 * if one is set, otherwise with PHP's native metaphone().
 *
 * @param string $word Word to convert
 * @param bool $returnRawMetaphoneValue If set, the raw metaphone value is returned instead of its hash
 * @return mixed Raw metaphone value, or its integer hash; 0 if the raw value is an empty string
 */
public function metaphone($word, $returnRawMetaphoneValue = false)
{
    // Use native PHP functions unless the advanced metaphone hook is available
    $rawValue = is_object($this->metaphoneObj)
        ? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
        : metaphone($word);

    if ($returnRawMetaphoneValue) {
        return $rawValue;
    }
    if ($rawValue === '') {
        return 0;
    }

    // Create hash and return integer
    return IndexedSearchUtility::md5inthash($rawValue);
}
1158 
1159  /********************************
1160  *
1161  * SQL; TYPO3 Pages
1162  *
1163  *******************************/
/**
 * Writes the index records of the current TYPO3 page.
 *
 * Existing records for the page hash are removed first. Then a row is
 * inserted into index_phash, the section and gr_list records are submitted,
 * a row is inserted into index_fulltext and - in debug mode - into
 * index_debug. Each insert is skipped when the table is not in use.
 */
public function submitPage()
{
    $phash = $this->hash['phash'];
    $now = $GLOBALS['EXEC_TIME'];
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Drop whatever is currently stored for this phash
    $this->removeOldIndexedPages($phash);

    // PROCESSING index_phash
    $staticPageArguments = null;
    if (is_array($this->conf['staticPageArguments'])) {
        $staticPageArguments = json_encode($this->conf['staticPageArguments']);
    }
    $phashRow = [
        'phash' => $phash,
        'phash_grouping' => $this->hash['phash_grouping'],
        'static_page_arguments' => $staticPageArguments,
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // 0 = TYPO3 page
        'item_type' => 0,
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => (int)$this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $now,
        'crdate' => $now,
        // Creation date of the page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connectionPool->getConnectionForTable('index_phash')->insert('index_phash', $phashRow);
    }

    // PROCESSING index_section
    $this->submit_section($phash, $phash);

    // PROCESSING index_grlist
    $this->submit_grlist($phash, $phash);

    // PROCESSING index_fulltext
    $fulltextData = implode(' ', $this->contentParts);
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        $fulltextData = substr($fulltextData, 0, $this->indexerConfig['fullTextDataLength']);
    }
    $fulltextRow = [
        'phash' => $phash,
        'fulltextdata' => $fulltextData,
        'metaphonedata' => $this->metaphoneContent,
    ];
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $connectionPool->getConnectionForTable('index_fulltext')->insert('index_fulltext', $fulltextRow);
    }

    // PROCESSING index_debug
    if (!$this->indexerConfig['debugMode']) {
        return;
    }
    // Content and body are shortened to 1000 bytes for the debug record
    $debugRow = [
        'phash' => $phash,
        'debuginfo' => json_encode([
            'external_parsers initialized' => array_keys($this->external_parsers),
            'conf' => array_merge($this->conf, ['content' => substr($this->conf['content'], 0, 1000)]),
            'contentParts' => array_merge($this->contentParts, ['body' => substr($this->contentParts['body'], 0, 1000)]),
            'logs' => $this->internal_log,
            'lexer' => $this->lexerObj->debugString,
        ]),
    ];
    if (IndexedSearchUtility::isTableUsed('index_debug')) {
        $connectionPool->getConnectionForTable('index_debug')->insert('index_debug', $debugRow);
    }
}
1243 
/**
 * Inserts a gr_list record for the given hashes into index_grlist.
 *
 * The group list is taken from $this->conf['gr_list']. Nothing is written
 * when the index_grlist table is not in use.
 *
 * @param int $hash Value stored in the "phash" column
 * @param int $phash_x Value stored in the "phash_x" column
 */
public function submit_grlist($hash, $phash_x)
{
    $groupList = $this->conf['gr_list'];
    $record = [
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($groupList),
        'gr_list' => $groupList,
    ];

    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->insert('index_grlist', $record);
}
1266 
/**
 * Inserts a record into index_section for the current page id.
 *
 * The record carries the two given hashes, the page id from
 * $this->conf['id'] and the rootline fields added by getRootLineFields().
 * Nothing is written when the index_section table is not in use.
 *
 * @param int $hash Value stored in the "phash" column
 * @param int $hash_t3 Value stored in the "phash_t3" column
 */
public function submit_section($hash, $hash_t3)
{
    $fields = [
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id'],
    ];
    // Add rl0, rl1, rl2 (and configured extra rootline fields) so the new
    // record carries the same rootline data that updateRootline() maintains.
    $this->getRootLineFields($fields);

    if (IndexedSearchUtility::isTableUsed('index_section')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_section')
            ->insert('index_section', $fields);
    }
}
1288 
/**
 * Removes the index records of a TYPO3 page.
 *
 * Deletes all rows with the given phash from the page related index tables
 * that are in use, and additionally all index_section rows whose phash_t3
 * equals the given phash.
 *
 * @param int $phash Page hash whose records are removed
 */
public function removeOldIndexedPages($phash)
{
    // Removing old registrations for all tables. Because the pages are TYPO3 pages
    // there can be nothing else than 1-1 relations here.
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    $tableArray = ['index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug'];
    foreach ($tableArray as $table) {
        // Only touch tables that are in use, as every other method of this class does
        if (IndexedSearchUtility::isTableUsed($table)) {
            $connectionPool->getConnectionForTable($table)->delete($table, ['phash' => (int)$phash]);
        }
    }

    // Removing all index_section records with phash_t3 set to this hash (this includes such
    // records set for external media on the page as well!). The re-insert of these records
    // is done in indexRegularDocument($file).
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        $connectionPool->getConnectionForTable('index_section')
            ->delete('index_section', ['phash_t3' => (int)$phash]);
    }
}
1314 
1315  /********************************
1316  *
1317  * SQL; External media
1318  *
1319  *******************************/
/**
 * Writes the index records of an external file.
 *
 * Existing records for the file hash are removed first. Then a row is
 * inserted into index_phash and index_fulltext and - in debug mode - into
 * index_debug. Each insert is skipped when the table is not in use.
 *
 * @param array $hash Hash array; the keys 'phash' and 'phash_grouping' are read
 * @param string $file File name / URL, stored as "data_filename"
 * @param array $subinfo Sub information, stored JSON encoded as "static_page_arguments"
 * @param string $ext File extension, used to look up the item type of the external parser
 * @param int $mtime Stored as "item_mtime"
 * @param int $ctime Stored as "item_crdate"
 * @param int $size Stored as "item_size"
 * @param int $content_md5h Stored as "contentHash"
 * @param array $contentParts Content parts; 'title' and 'body' are read, all parts are joined for the fulltext
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
{
    $phash = $hash['phash'];
    $now = $GLOBALS['EXEC_TIME'];
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Item type as mapped by the external parser, falling back to the extension itself
    $storeItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext] ?: $ext;

    // Drop whatever is currently stored for this phash
    $this->removeOldIndexedFiles($phash);

    // A file reference with a scheme is flagged as external URL
    $fileParts = parse_url($file);
    $isExternalUrl = ($fileParts['scheme'] ?? false) ? 1 : 0;

    // PROCESSING index_phash
    $phashRow = [
        'phash' => $phash,
        'phash_grouping' => $hash['phash_grouping'],
        'static_page_arguments' => json_encode($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $storeItemType,
        'item_title' => trim($contentParts['title']) ?: PathUtility::basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $now,
        'crdate' => $now,
        'gr_list' => $this->conf['gr_list'],
        'externalUrl' => $isExternalUrl,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid'],
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connectionPool->getConnectionForTable('index_phash')->insert('index_phash', $phashRow);
    }

    // PROCESSING index_fulltext
    $fulltextData = implode(' ', $contentParts);
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        $fulltextData = substr($fulltextData, 0, $this->indexerConfig['fullTextDataLength']);
    }
    $fulltextRow = [
        'phash' => $phash,
        'fulltextdata' => $fulltextData,
        'metaphonedata' => $this->metaphoneContent,
    ];
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $connectionPool->getConnectionForTable('index_fulltext')->insert('index_fulltext', $fulltextRow);
    }

    // PROCESSING index_debug
    if (!$this->indexerConfig['debugMode']) {
        return;
    }
    // The body is shortened to 1000 bytes for the debug record
    $debugRow = [
        'phash' => $phash,
        'debuginfo' => json_encode([
            'static_page_arguments' => $subinfo,
            'contentParts' => array_merge($contentParts, ['body' => substr($contentParts['body'], 0, 1000)]),
            'logs' => $this->internal_log,
            'lexer' => $this->lexerObj->debugString,
        ]),
    ];
    if (IndexedSearchUtility::isTableUsed('index_debug')) {
        $connectionPool->getConnectionForTable('index_debug')->insert('index_debug', $debugRow);
    }
}
1404 
/**
 * Ensures that a gr_list record exists for an indexed file.
 *
 * Counts the index_grlist rows of the given phash whose hash_gr_list matches
 * either the default group list ($this->defaultGrList) or the current group
 * list ($this->conf['gr_list']). When no such row exists, a new record is
 * inserted via submit_grlist(). Does nothing when index_grlist is not in use.
 *
 * @param int $hash Phash of the file
 */
public function submitFile_grlist($hash)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    // Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_grlist');
    $count = (int)$queryBuilder->count('*')
        ->from('index_grlist')
        ->where(
            $queryBuilder->expr()->eq(
                'phash',
                $queryBuilder->createNamedParameter($hash, Connection::PARAM_INT)
            ),
            $queryBuilder->expr()->or(
                $queryBuilder->expr()->eq(
                    'hash_gr_list',
                    $queryBuilder->createNamedParameter(
                        IndexedSearchUtility::md5inthash($this->defaultGrList),
                        Connection::PARAM_INT
                    )
                ),
                $queryBuilder->expr()->eq(
                    'hash_gr_list',
                    $queryBuilder->createNamedParameter(
                        IndexedSearchUtility::md5inthash($this->conf['gr_list']),
                        Connection::PARAM_INT
                    )
                )
            )
        )
        ->executeQuery()
        ->fetchOne();

    if ($count === 0) {
        // No matching record yet: register the current gr_list for this file
        $this->submit_grlist($hash, $hash);
    }
}
1450 
/**
 * Ensures that a section record exists for an indexed file on the current page.
 *
 * Counts the index_section rows matching the given phash and the current
 * page id ($this->conf['id']); when there is none, submit_section() is
 * called with the file hash and the page hash ($this->hash['phash']).
 * Does nothing when index_section is not in use.
 *
 * @param int $hash Phash of the file
 */
public function submitFile_section($hash)
{
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }

    // Look for an already existing section record
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_section');
    $matchesPhash = $queryBuilder->expr()->eq(
        'phash',
        $queryBuilder->createNamedParameter($hash, Connection::PARAM_INT)
    );
    $matchesPage = $queryBuilder->expr()->eq(
        'page_id',
        $queryBuilder->createNamedParameter($this->conf['id'], Connection::PARAM_INT)
    );
    $existingSections = (int)$queryBuilder->count('phash')
        ->from('index_section')
        ->where($matchesPhash, $matchesPage)
        ->executeQuery()
        ->fetchOne();

    if ($existingSections !== 0) {
        return;
    }
    $this->submit_section($hash, $this->hash['phash']);
}
1484 
/**
 * Removes the index records of an indexed file.
 *
 * Deletes all rows with the given phash from index_phash, index_grlist,
 * index_fulltext and index_debug, skipping tables that are not in use.
 *
 * @param int $phash File hash whose records are removed
 */
public function removeOldIndexedFiles($phash)
{
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    $identifier = ['phash' => (int)$phash];

    foreach (['index_phash', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        if (IndexedSearchUtility::isTableUsed($tableName)) {
            $connectionPool->getConnectionForTable($tableName)->delete($tableName, $identifier);
        }
    }
}
1502 
1503  /********************************
1504  *
1505  * SQL Helper functions
1506  *
1507  *******************************/
/**
 * Decides whether a document has to be (re-)indexed, based on the stored
 * timestamps of its index_phash record.
 *
 * Result codes (see $this->reasons): positive values mean "index it",
 * negative values mean "do not index it".
 *
 * @param int $mtime Modification time of the document
 * @param int $phash Hash of the document
 * @return int One of 1, 2, 3, 4, -1, -2
 */
public function checkMtimeTstamp($mtime, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        // Not indexed (not in index_phash)
        return 4;
    }

    $row = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(['item_mtime', 'tstamp'], 'index_phash', ['phash' => (int)$phash], [], [], 1)
        ->fetchAssociative();

    if (empty($row)) {
        // Page has never been indexed (is not represented in the index_phash table).
        return 4;
    }

    $now = $GLOBALS['EXEC_TIME'];

    if ($this->tstamp_maxAge && $now > $row['tstamp'] + $this->tstamp_maxAge) {
        // The configured max-age was exceeded for the document and thus it's indexed.
        return 1;
    }

    if ($this->tstamp_minAge && !($now > $row['tstamp'] + $this->tstamp_minAge)) {
        // The minimum age was not exceeded
        return -2;
    }

    // From here on: no minimum age configured, or the minimum age was exceeded.
    if (!$mtime) {
        // The minimum age was exceeded, but mtime was not set, so the page is indexed.
        return 3;
    }

    if ($row['item_mtime'] != $mtime) {
        // The minimum age was exceeded, mtime was set and differs from the stored one, so the page is indexed.
        return 2;
    }

    // mtime matched the document, so no changes detected and no content updated
    if ($this->tstamp_maxAge) {
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($row['tstamp'] + $this->tstamp_maxAge - $now) . ' seconds to expire time).', LogLevel::WARNING);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.', LogLevel::NOTICE);
    }
    return -1;
}
1573 
/**
 * Looks for an index_phash record with the same phash_grouping and the same
 * content hash as the current page.
 *
 * With this check a page is only indexed if its content differs from the
 * already indexed page of the same "phash_grouping".
 *
 * @return mixed The found row (array with key 'phash'), or true when no such
 *               row exists or index_phash is not in use
 */
public function checkContentHash()
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $criteria = [
        'phash_grouping' => (int)$this->hash['phash_grouping'],
        'contentHash' => (int)$this->content_md5h,
    ];
    $matchingRow = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(['phash'], 'index_phash', $criteria, [], [], 1)
        ->fetchAssociative();

    return empty($matchingRow) ? true : $matchingRow;
}
1604 
/**
 * Checks that no index_phash record exists with the given grouping hash and
 * content hash.
 *
 * @param int $hashGr Value compared against "phash_grouping"
 * @param int $content_md5h Value compared against "contentHash"
 * @return bool True when no matching record exists or index_phash is not in use
 */
public function checkExternalDocContentHash($hashGr, $content_md5h)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $criteria = [
        'phash_grouping' => (int)$hashGr,
        'contentHash' => (int)$content_md5h,
    ];
    $matches = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->count('*', 'index_phash', $criteria);

    return $matches === 0;
}
1632 
/**
 * Tells whether index_grlist contains at least one record for the given phash_x.
 *
 * @param int $phash_x Value compared against the "phash_x" column
 * @return bool True when a record exists; false otherwise or when index_grlist is not in use
 */
public function is_grlist_set($phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return false;
    }

    $records = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count('phash_x', 'index_grlist', ['phash_x' => (int)$phash_x]);

    return $records > 0;
}
1655 
/**
 * Inserts a gr_list record for the phash unless one for the current group
 * list ($this->conf['gr_list']) already exists.
 *
 * Does nothing when index_grlist is not in use.
 *
 * @param int $phash Value compared against / stored in the "phash" column
 * @param int $phash_x Value stored in the "phash_x" column of a new record
 */
public function update_grlist($phash, $phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $criteria = [
        'phash' => (int)$phash,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($this->conf['gr_list']),
    ];
    $existing = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count('phash', 'index_grlist', $criteria);

    if ($existing !== 0) {
        return;
    }

    $this->submit_grlist($phash, $phash_x);
    $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', LogLevel::NOTICE);
}
1683 
/**
 * Sets the tstamp of an index_phash record to the current execution time
 * and, when given, also its item_mtime.
 *
 * Does nothing when index_phash is not in use.
 *
 * @param int $phash Hash identifying the record
 * @param int $mtime If non-empty, written to "item_mtime"
 */
public function updateTstamp($phash, $mtime = 0)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }

    $changes = ['tstamp' => $GLOBALS['EXEC_TIME']];
    if ($mtime) {
        $changes['item_mtime'] = (int)$mtime;
    }

    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->update('index_phash', $changes, ['phash' => (int)$phash]);
}
1714 
/**
 * Writes the current freeIndexSetId ($this->conf['freeIndexSetId']) to an
 * index_phash record.
 *
 * Does nothing when index_phash is not in use.
 *
 * @param int $phash Hash identifying the record
 */
public function updateSetId($phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }

    $changes = ['freeIndexSetId' => (int)$this->conf['freeIndexSetId']];
    $identifier = ['phash' => (int)$phash];

    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->update('index_phash', $changes, $identifier);
}
1738 
/**
 * Writes the parsetime of an index_phash record.
 *
 * Does nothing when index_phash is not in use.
 *
 * @param int $phash Hash identifying the record
 * @param int $parsetime Value written to the "parsetime" column
 */
public function updateParsetime($phash, $parsetime)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }

    $changes = ['parsetime' => (int)$parsetime];
    $identifier = ['phash' => (int)$phash];

    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->update('index_phash', $changes, $identifier);
}
1763 
/**
 * Updates the rootline fields of all index_section records of the current
 * page ($this->conf['id']) with the values provided by getRootLineFields().
 *
 * Does nothing when index_section is not in use.
 */
public function updateRootline()
{
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }

    $rootLineValues = [];
    $this->getRootLineFields($rootLineValues);
    $identifier = ['page_id' => (int)$this->conf['id']];

    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_section')
        ->update('index_section', $rootLineValues, $identifier);
}
/**
 * Adds the rootline fields to a field array.
 *
 * Sets rl0, rl1 and rl2 from the first three entries of
 * $this->conf['rootline_uids'] and one additional field per entry of
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields']
 * (field name => rootline level). A rootline level that does not exist in
 * the current rootline yields 0.
 *
 * @param array $fieldArray Field array, modified by reference
 */
public function getRootLineFields(array &$fieldArray)
{
    $rootLineUids = $this->conf['rootline_uids'] ?? [];

    $fieldArray['rl0'] = (int)($rootLineUids[0] ?? 0);
    $fieldArray['rl1'] = (int)($rootLineUids[1] ?? 0);
    $fieldArray['rl2'] = (int)($rootLineUids[2] ?? 0);

    $additionalFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] ?? [];
    foreach ($additionalFields as $fieldName => $rootLineLevel) {
        // Same fallback as for rl0..rl2: short rootlines must not raise an undefined-key warning
        $fieldArray[$fieldName] = (int)($rootLineUids[$rootLineLevel] ?? 0);
    }
}
1802 
1803  /********************************
1804  *
1805  * SQL; Submitting words
1806  *
1807  *******************************/
/**
 * Makes sure that every word of the word list is present in index_words.
 *
 * Counts the index_words rows whose wid is one of the word hashes. When
 * that number differs from the size of the word list, the existing wids are
 * fetched and every word whose hash is not among them is inserted.
 * Does nothing when index_words is not in use or the word list is empty.
 *
 * @param array $wordListArray Word list keyed by baseword; each entry provides 'hash' and 'metaphone'
 */
public function checkWordList($wordListArray)
{
    if (!IndexedSearchUtility::isTableUsed('index_words') || empty($wordListArray)) {
        return;
    }

    $wordListArrayCount = count($wordListArray);
    $phashArray = array_map('intval', array_column($wordListArray, 'hash'));

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_words');

    $queryBuilder = $connection->createQueryBuilder();
    $count = (int)$queryBuilder->count('baseword')
        ->from('index_words')
        ->where(
            $queryBuilder->expr()->in(
                'wid',
                $queryBuilder->createNamedParameter($phashArray, Connection::PARAM_INT_ARRAY)
            )
        )
        ->executeQuery()
        ->fetchOne();

    if ($count === $wordListArrayCount) {
        // Every word is already known
        return;
    }

    $queryBuilder = $connection->createQueryBuilder();
    $result = $queryBuilder->select('wid')
        ->from('index_words')
        ->where(
            $queryBuilder->expr()->in(
                'wid',
                $queryBuilder->createNamedParameter($phashArray, Connection::PARAM_INT_ARRAY)
            )
        )
        ->executeQuery();

    $this->log_setTSlogMessage('Inserting words: ' . ($wordListArrayCount - $count), LogLevel::NOTICE);

    // Collect the already stored word ids as a lookup set. The ids are cast
    // to int because the database driver may return them as strings.
    $knownWordIds = [];
    while ($row = $result->fetchAssociative()) {
        $knownWordIds[(int)$row['wid']] = true;
    }

    foreach ($wordListArray as $baseword => $wordData) {
        if (isset($knownWordIds[(int)$wordData['hash']])) {
            continue;
        }
        // A duplicate-key error will occur here if an already stored word is not skipped above.
        // However as long as the words are NOT longer than 60 chars (the baseword varchar is
        // 60 characters...) this is not a problem.
        $connection->insert(
            'index_words',
            [
                'wid' => $wordData['hash'],
                'baseword' => $baseword,
                'metaphone' => $wordData['metaphone'],
            ]
        );
    }
}
1872 
/**
 * Writes the word relations of a document to index_rel.
 *
 * Existing index_rel rows of the phash are deleted, then one row per word
 * of the word list is bulk-inserted. Words whose hash is registered in
 * index_words with a non-zero is_stopword flag are left out.
 * Does nothing when index_rel is not in use.
 *
 * @param array $wordList Word list; each entry provides 'hash' and 'count', optionally 'first' and 'cmp'
 * @param int $phash Hash of the document
 */
public function submitWords($wordList, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_rel')) {
        return;
    }
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    $queryBuilder = $connectionPool->getQueryBuilderForTable('index_words');
    $result = $queryBuilder->select('wid')
        ->from('index_words')
        ->where(
            $queryBuilder->expr()->neq('is_stopword', $queryBuilder->createNamedParameter(0, Connection::PARAM_INT))
        )
        ->groupBy('wid')
        ->executeQuery();

    $stopWords = [];
    while ($row = $result->fetchAssociative()) {
        $stopWords[$row['wid']] = $row;
    }

    $connectionPool->getConnectionForTable('index_rel')->delete('index_rel', ['phash' => (int)$phash]);

    $fields = ['phash', 'wid', 'count', 'first', 'freq', 'flags'];
    $rows = [];
    foreach ($wordList as $val) {
        if (isset($stopWords[$val['hash']])) {
            continue;
        }
        // Relative frequency of the word; guarded because a zero word count
        // would otherwise raise a DivisionByZeroError.
        $frequency = $this->wordcount > 0 ? $val['count'] / $this->wordcount : 0;
        $rows[] = [
            (int)$phash,
            (int)$val['hash'],
            (int)$val['count'],
            (int)($val['first'] ?? 0),
            $this->freqMap($frequency),
            ($val['cmp'] ?? 0) & $this->flagBitMask,
        ];
    }

    if (!empty($rows)) {
        $connectionPool->getConnectionForTable('index_rel')->bulkInsert('index_rel', $rows, $fields);
    }
}
1921 
/**
 * Maps a frequency value to an integer.
 *
 * The map factor is freqMax * 100 * freqRange. A frequency of at most 1 is
 * multiplied by the factor and capped at freqRange; a larger frequency is
 * divided by the factor.
 *
 * @param float $freq Frequency
 * @return int Mapped frequency
 */
public function freqMap($freq)
{
    $mapFactor = $this->freqMax * 100 * $this->freqRange;

    if ($freq > 1) {
        return (int)($freq / $mapFactor);
    }

    $scaled = $freq * $mapFactor;
    if ($scaled > $this->freqRange) {
        $scaled = $this->freqRange;
    }
    return (int)$scaled;
}
1940 
1941  /********************************
1942  *
1943  * Hashing
1944  *
1945  *******************************/
/**
 * Calculates the hashes of the current TYPO3 page and stores them in $this->hash.
 *
 * 'phash_grouping' is built from id, type, language, mount point and static
 * page arguments; 'phash' additionally takes the gr_list into account.
 */
public function setT3Hashes()
{
    $staticPageArguments = null;
    if (is_array($this->conf['staticPageArguments'])) {
        $staticPageArguments = json_encode($this->conf['staticPageArguments']);
    }

    // The key order matters, because the serialized array is hashed
    $hashSource = [
        'id' => (int)$this->conf['id'],
        'type' => (int)$this->conf['type'],
        'sys_lang' => (int)$this->conf['sys_language_uid'],
        'MP' => (string)$this->conf['MP'],
        'staticPageArguments' => $staticPageArguments,
    ];

    // Grouping hash: identifies a "page" combined of id, type, language, mountpoint and static page arguments
    $this->hash['phash_grouping'] = IndexedSearchUtility::md5inthash(serialize($hashSource));

    // Plain phash: subdivision where the page composition based on login (gr_list) is taken into
    // account as well. It is expected that such pages are normally similar regardless of the login.
    $hashSource['gr_list'] = (string)$this->conf['gr_list'];
    $this->hash['phash'] = IndexedSearchUtility::md5inthash(serialize($hashSource));
}
1965 
/**
 * Calculates the hashes of an external file.
 *
 * 'phash_grouping' is built from the file name only; 'phash' additionally
 * takes the sub information into account.
 *
 * @param string $file File name / path
 * @param array $subinfo Additional information about the file
 * @return array Array with the keys 'phash_grouping' and 'phash'
 */
public function setExtHashes($file, $subinfo = [])
{
    $groupingSource = ['file' => $file];
    $fullSource = ['file' => $file, 'subinfo' => $subinfo];

    return [
        'phash_grouping' => IndexedSearchUtility::md5inthash(serialize($groupingSource)),
        'phash' => IndexedSearchUtility::md5inthash(serialize($fullSource)),
    ];
}
1987 
1988  /*********************************
1989  *
1990  * Internal logging functions
1991  *
1992  *********************************/
/**
 * Forwards a push() call to the time tracker.
 *
 * @param string $msg Passed as first argument to the time tracker
 * @param string $key Passed as second argument to the time tracker
 */
public function log_push($msg, $key)
{
    $tracker = $this->timeTracker;
    $tracker->push($msg, $key);
}
2003 
/**
 * Forwards a pull() call to the time tracker.
 */
public function log_pull()
{
    $tracker = $this->timeTracker;
    $tracker->pull();
}
2011 
/**
 * Passes a log message to the time tracker and appends it to the internal log.
 *
 * @param string $msg Message
 * @param string $logLevel Log level, LogLevel::INFO by default
 */
public function log_setTSlogMessage($msg, $logLevel = LogLevel::INFO)
{
    $tracker = $this->timeTracker;
    $tracker->setTSlogMessage($msg, $logLevel);

    // Keep a copy of the message; the internal log is part of the debug information
    $this->internal_log[] = $msg;
}
2023 
/**
 * Re-formats a comma separated keyword list: the keywords are split with
 * GeneralUtility::trimExplode(), joined with ", " and the result is wrapped
 * in a leading and a trailing space.
 *
 * @param string $keywordList Comma separated list of keywords
 * @return string Re-formatted keyword list
 */
protected function addSpacesToKeywordList($keywordList)
{
    $joinedKeywords = implode(', ', GeneralUtility::trimExplode(',', $keywordList));
    return ' ' . $joinedKeywords . ' ';
}
2037 
/**
 * Removes words whose hash collides with the hash of an earlier word.
 *
 * For every hash only the first word of the list is kept; the original
 * keys and order of the remaining words are preserved.
 *
 * @param array $wordList Word list keyed by baseword; each entry provides 'hash'
 * @return array Word list without hash collisions
 */
private function removePhashCollisions(array $wordList): array
{
    // Hashes seen so far, stored as array keys for constant-time lookup
    // instead of scanning a growing list with in_array() for every word.
    // NOTE(review): array keys do not distinguish int 1 from string '1';
    // this assumes all hashes share one type — confirm against the caller.
    $seenHashes = [];
    foreach ($wordList as $baseword => $wordData) {
        $wordHash = $wordData['hash'];
        if (isset($seenHashes[$wordHash])) {
            unset($wordList[$baseword]);
            continue;
        }
        $seenHashes[$wordHash] = true;
    }
    return $wordList;
}
2057 }
‪TYPO3\CMS\IndexedSearch\Indexer\splitHTMLContent
‪array splitHTMLContent($content)
Definition: Indexer.php:359
‪TYPO3\CMS\IndexedSearch\Indexer\updateParsetime
‪updateParsetime($phash, $parsetime)
Definition: Indexer.php:1718
‪TYPO3\CMS\IndexedSearch\Indexer\$lexerObj
‪TYPO3 CMS IndexedSearch Lexer $lexerObj
Definition: Indexer.php:188
‪TYPO3\CMS\IndexedSearch
‪TYPO3\CMS\Core\Utility\PathUtility
Definition: PathUtility.php:27
‪TYPO3\CMS\Core\Database\Connection\PARAM_INT
‪const PARAM_INT
Definition: Connection.php:46
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPathUsingDomainURL
‪string createLocalPathUsingDomainURL($sourcePath)
Definition: Indexer.php:694
‪TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility
Definition: DoubleMetaPhoneUtility.php:27
‪TYPO3\CMS\Core\Utility\PathUtility\isAbsolutePath
‪static isAbsolutePath(string $path)
Definition: PathUtility.php:286
‪TYPO3\CMS\IndexedSearch\Indexer\submit_grlist
‪submit_grlist($hash, $phash_x)
Definition: Indexer.php:1224
‪TYPO3\CMS\IndexedSearch\Indexer\$defaultContentArray
‪array $defaultContentArray
Definition: Indexer.php:100
‪TYPO3\CMS\Core\Html\HtmlParser
Definition: HtmlParser.php:26
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPathUsingAbsRefPrefix
‪string createLocalPathUsingAbsRefPrefix($sourcePath)
Definition: Indexer.php:716
‪TYPO3\CMS\IndexedSearch\Indexer\indexRegularDocument
‪indexRegularDocument($file, $force=false, $contentTmpFile='', $altExtension='')
Definition: Indexer.php:810
‪TYPO3\CMS\IndexedSearch\Indexer\charsetEntity2utf8
‪charsetEntity2utf8(&$contentArr)
Definition: Indexer.php:969
‪TYPO3\CMS\Core\Configuration\ExtensionConfiguration
Definition: ExtensionConfiguration.php:47
‪TYPO3\CMS\IndexedSearch\Indexer\$externalFileCounter
‪int $externalFileCounter
Definition: Indexer.php:113
‪TYPO3\CMS\IndexedSearch\Indexer\updateSetId
‪updateSetId($phash)
Definition: Indexer.php:1693
‪TYPO3\CMS\IndexedSearch\Indexer\isAllowedLocalFile
‪static bool isAllowedLocalFile($filePath)
Definition: Indexer.php:789
‪TYPO3\CMS\IndexedSearch\Indexer\$external_parsers
‪array $external_parsers
Definition: Indexer.php:61
‪TYPO3\CMS\IndexedSearch\Indexer\indexAnalyze
‪array indexAnalyze($content)
Definition: Indexer.php:1026
‪TYPO3\CMS\IndexedSearch\Indexer\$indexerConfig
‪array $indexerConfig
Definition: Indexer.php:123
‪TYPO3\CMS\Core\Core\Environment\getPublicPath
‪static getPublicPath()
Definition: Environment.php:187
‪TYPO3\CMS\IndexedSearch\Indexer\isRelativeURL
‪static bool isRelativeURL($url)
Definition: Indexer.php:777
‪TYPO3\CMS\IndexedSearch\Indexer\$storeMetaphoneInfoAsWords
‪bool $storeMetaphoneInfoAsWords
Definition: Indexer.php:173
‪TYPO3\CMS\IndexedSearch\Indexer\indexExternalUrl
‪indexExternalUrl($externalUrl)
Definition: Indexer.php:625
‪TYPO3\CMS\IndexedSearch\Indexer\analyzeHeaderinfo
‪analyzeHeaderinfo(&$retArr, $content, $key, $offset)
Definition: Indexer.php:1044
‪TYPO3\CMS\IndexedSearch\Indexer\$metaphoneObj
‪DoubleMetaPhoneUtility $metaphoneObj
Definition: Indexer.php:182
‪TYPO3\CMS\IndexedSearch\Indexer\removeOldIndexedPages
‪removeOldIndexedPages($phash)
Definition: Indexer.php:1267
‪TYPO3\CMS\IndexedSearch\Indexer\log_pull
‪log_pull()
Definition: Indexer.php:1980
‪TYPO3\CMS\IndexedSearch\Indexer\$freqMax
‪float $freqMax
Definition: Indexer.php:165
‪TYPO3\CMS\IndexedSearch\Indexer\checkContentHash
‪mixed checkContentHash()
Definition: Indexer.php:1552
‪TYPO3\CMS\IndexedSearch\Indexer\convertHTMLToUtf8
‪string convertHTMLToUtf8($content, $charset='')
Definition: Indexer.php:429
‪$fields
‪$fields
Definition: pages.php:5
‪TYPO3\CMS\IndexedSearch\Indexer\bodyDescription
‪string bodyDescription($contentArr)
Definition: Indexer.php:1006
‪TYPO3\CMS\IndexedSearch\Indexer\update_grlist
‪update_grlist($phash, $phash_x)
Definition: Indexer.php:1636
‪TYPO3\CMS\Core\Utility\PathUtility\basename
‪static basename(string $path)
Definition: PathUtility.php:219
‪TYPO3\CMS\IndexedSearch\Indexer\freqMap
‪int freqMap($freq)
Definition: Indexer.php:1902
‪TYPO3\CMS\IndexedSearch\Indexer\$content_md5h
‪int $content_md5h
Definition: Indexer.php:147
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPathFromAbsoluteURL
‪string createLocalPathFromAbsoluteURL($sourcePath)
Definition: Indexer.php:740
‪TYPO3\CMS\IndexedSearch\Indexer\$forceIndexing
‪bool $forceIndexing
Definition: Indexer.php:94
‪TYPO3\CMS\IndexedSearch\Indexer\submitFile_section
‪submitFile_section($hash)
Definition: Indexer.php:1429
‪TYPO3\CMS\IndexedSearch\Indexer\$file_phash_arr
‪array $file_phash_arr
Definition: Indexer.php:135
‪TYPO3\CMS\IndexedSearch\Indexer\$wordcount
‪int $wordcount
Definition: Indexer.php:109
‪TYPO3\CMS\IndexedSearch\Indexer\removeOldIndexedFiles
‪removeOldIndexedFiles($phash)
Definition: Indexer.php:1463
‪TYPO3\CMS\IndexedSearch\Indexer\updateRootline
‪updateRootline()
Definition: Indexer.php:1740
‪TYPO3\CMS\IndexedSearch\Indexer\is_grlist_set
‪bool is_grlist_set($phash_x)
Definition: Indexer.php:1612
‪TYPO3\CMS\IndexedSearch\Indexer\log_push
‪log_push($msg, $key)
Definition: Indexer.php:1972
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility\isTableUsed
‪static bool isTableUsed(string $tableName)
Definition: IndexedSearchUtility.php:37
‪TYPO3\CMS\IndexedSearch\Indexer\$excludeSections
‪string $excludeSections
Definition: Indexer.php:55
‪TYPO3\CMS\IndexedSearch\Indexer\checkMtimeTstamp
‪int checkMtimeTstamp($mtime, $phash)
Definition: Indexer.php:1489
‪TYPO3\CMS\IndexedSearch\Indexer\$reasons
‪array $reasons
Definition: Indexer.php:42
‪TYPO3\CMS\IndexedSearch\Indexer\$timeTracker
‪TimeTracker $timeTracker
Definition: Indexer.php:196
‪TYPO3\CMS\IndexedSearch\Indexer\$enableMetaphoneSearch
‪bool $enableMetaphoneSearch
Definition: Indexer.php:169
‪TYPO3\CMS\Core\Utility\GeneralUtility\getUrl
‪static string false getUrl(string $url)
Definition: GeneralUtility.php:1427
‪TYPO3\CMS\IndexedSearch\Indexer\$indexExternalUrl_content
‪string $indexExternalUrl_content
Definition: Indexer.php:157
‪TYPO3\CMS\IndexedSearch\Indexer\$contentParts
‪array $contentParts
Definition: Indexer.php:141
‪TYPO3\CMS\IndexedSearch\Indexer\$flagBitMask
‪int $flagBitMask
Definition: Indexer.php:192
‪TYPO3\CMS\IndexedSearch\Indexer\$internal_log
‪array $internal_log
Definition: Indexer.php:151
‪TYPO3\CMS\IndexedSearch\Indexer\fileContentParts
‪array fileContentParts($ext, $absFile)
Definition: Indexer.php:935
‪TYPO3\CMS\IndexedSearch\Indexer\log_setTSlogMessage
‪log_setTSlogMessage($msg, $logLevel=LogLevel::INFO)
Definition: Indexer.php:1991
‪TYPO3\CMS\IndexedSearch\Indexer\setExtHashes
‪array setExtHashes($file, $subinfo=[])
Definition: Indexer.php:1946
‪TYPO3\CMS\IndexedSearch\Indexer\$tstamp_minAge
‪int $tstamp_minAge
Definition: Indexer.php:82
‪TYPO3\CMS\IndexedSearch\Indexer\checkExternalDocContentHash
‪bool checkExternalDocContentHash($hashGr, $content_md5h)
Definition: Indexer.php:1586
‪TYPO3\CMS\IndexedSearch\Indexer\init
‪init(array $configuration=null)
Definition: Indexer.php:225
‪TYPO3\CMS\IndexedSearch\Indexer\submitFilePage
‪submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
Definition: Indexer.php:1306
‪TYPO3\CMS\IndexedSearch\Indexer\$hash
‪array $hash
Definition: Indexer.php:129
‪TYPO3\CMS\IndexedSearch\Indexer\submitWords
‪submitWords($wordList, $phash)
Definition: Indexer.php:1852
‪TYPO3\CMS\IndexedSearch\Indexer\$conf
‪array $conf
Definition: Indexer.php:117
‪TYPO3\CMS\Core\Http\RequestFactory
Definition: RequestFactory.php:30
‪TYPO3\CMS\IndexedSearch\Indexer\updateTstamp
‪updateTstamp($phash, $mtime=0)
Definition: Indexer.php:1663
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility\md5inthash
‪static int md5inthash(string $stringToHash)
Definition: IndexedSearchUtility.php:50
‪TYPO3\CMS\IndexedSearch\Indexer\processWordsInArrays
‪array processWordsInArrays($contentArr)
Definition: Indexer.php:986
‪TYPO3\CMS\IndexedSearch\Indexer\embracingTags
‪bool embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
Definition: Indexer.php:454
‪TYPO3\CMS\IndexedSearch\Indexer\extractLinks
‪extractLinks($content)
Definition: Indexer.php:511
‪TYPO3\CMS\IndexedSearch\Indexer\indexTypo3PageContent
‪indexTypo3PageContent()
Definition: Indexer.php:279
‪TYPO3\CMS\Core\Database\Connection
Definition: Connection.php:35
‪TYPO3\CMS\IndexedSearch\Indexer\submitFile_grlist
‪submitFile_grlist($hash)
Definition: Indexer.php:1383
‪TYPO3\CMS\Webhooks\Message\$url
‪identifier readonly UriInterface $url
Definition: LoginErrorOccurredMessage.php:36
‪TYPO3\CMS\IndexedSearch\Indexer\getRootLineFields
‪getRootLineFields(array &$fieldArray)
Definition: Indexer.php:1766
‪TYPO3\CMS\IndexedSearch\Indexer\$tstamp_maxAge
‪int $tstamp_maxAge
Definition: Indexer.php:75
‪TYPO3\CMS\IndexedSearch\Indexer\getUrlHeaders
‪mixed getUrlHeaders($url)
Definition: Indexer.php:649
‪TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController
Definition: TypoScriptFrontendController.php:102
‪$GLOBALS
‪$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['adminpanel']['modules']
Definition: ext_localconf.php:25
‪TYPO3\CMS\Core\Core\Environment
Definition: Environment.php:41
‪TYPO3\CMS\IndexedSearch\Indexer\submitPage
‪submitPage()
Definition: Indexer.php:1140
‪TYPO3\CMS\IndexedSearch\Indexer\$metaphoneContent
‪string $metaphoneContent
Definition: Indexer.php:177
‪TYPO3\CMS\IndexedSearch\Indexer\$maxExternalFiles
‪int $maxExternalFiles
Definition: Indexer.php:88
‪TYPO3\CMS\Core\Utility\MathUtility
Definition: MathUtility.php:24
‪TYPO3\CMS\IndexedSearch\Indexer\getHTMLcharset
‪string getHTMLcharset($content)
Definition: Indexer.php:411
‪TYPO3\CMS\IndexedSearch\Indexer\extractHyperLinks
‪array extractHyperLinks($html)
Definition: Indexer.php:565
‪TYPO3\CMS\IndexedSearch\Indexer\addSpacesToKeywordList
‪string addSpacesToKeywordList($keywordList)
Definition: Indexer.php:2005
‪TYPO3\CMS\IndexedSearch\Indexer\initializeExternalParsers
‪initializeExternalParsers()
Definition: Indexer.php:259
‪TYPO3\CMS\Core\Database\ConnectionPool
Definition: ConnectionPool.php:51
‪TYPO3\CMS\IndexedSearch\Indexer\__construct
‪__construct()
Definition: Indexer.php:201
‪TYPO3\CMS\IndexedSearch\Indexer\submit_section
‪submit_section($hash, $hash_t3)
Definition: Indexer.php:1247
‪TYPO3\CMS\IndexedSearch\Indexer\splitRegularContent
‪array splitRegularContent($content)
Definition: Indexer.php:952
‪TYPO3\CMS\Core\Utility\MathUtility\forceIntegerInRange
‪static int forceIntegerInRange(mixed $theInt, int $min, int $max=2000000000, int $defaultValue=0)
Definition: MathUtility.php:34
‪TYPO3\CMS\IndexedSearch\Indexer
Definition: Indexer.php:39
‪TYPO3\CMS\Core\Utility\GeneralUtility
Definition: GeneralUtility.php:51
‪TYPO3\CMS\IndexedSearch\Indexer\metaphone
‪mixed metaphone($word, $returnRawMetaphoneValue=false)
Definition: Indexer.php:1113
‪TYPO3\CMS\Core\Utility\GeneralUtility\writeFile
‪static bool writeFile($file, $content, $changePermissions=false)
Definition: GeneralUtility.php:1452
‪TYPO3\CMS\IndexedSearch\Indexer\typoSearchTags
‪bool typoSearchTags(&$body)
Definition: Indexer.php:483
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPathFromRelativeURL
‪string createLocalPathFromRelativeURL($sourcePath)
Definition: Indexer.php:759
‪TYPO3\CMS\IndexedSearch\Indexer\checkWordList
‪checkWordList($wordListArray)
Definition: Indexer.php:1786
‪TYPO3\CMS\Core\TimeTracker\TimeTracker
Definition: TimeTracker.php:32
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility
Definition: IndexedSearchUtility.php:28
‪TYPO3\CMS\IndexedSearch\Indexer\analyzeBody
‪analyzeBody(&$retArr, $content)
Definition: Indexer.php:1078
‪TYPO3\CMS\IndexedSearch\Indexer\$freqRange
‪int $freqRange
Definition: Indexer.php:161
‪TYPO3\CMS\IndexedSearch\Indexer\extractBaseHref
‪string extractBaseHref($html)
Definition: Indexer.php:594
‪TYPO3\CMS\IndexedSearch\Indexer\setT3Hashes
‪setT3Hashes()
Definition: Indexer.php:1922
‪TYPO3\CMS\IndexedSearch\Indexer\$defaultGrList
‪string $defaultGrList
Definition: Indexer.php:69
‪TYPO3\CMS\IndexedSearch\Indexer\readFileContent
‪array|null readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
Definition: Indexer.php:918
‪TYPO3\CMS\IndexedSearch\Indexer\removePhashCollisions
‪array removePhashCollisions(array $wordList)
Definition: Indexer.php:2018
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility\milliseconds
‪static milliseconds()
Definition: IndexedSearchUtility.php:172
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPath
‪string createLocalPath($sourcePath)
Definition: Indexer.php:671