‪TYPO3CMS  10.4
Indexer.php
Go to the documentation of this file.
1 <?php
2 
3 /*
4  * This file is part of the TYPO3 CMS project.
5  *
6  * It is free software; you can redistribute it and/or modify it under
7  * the terms of the GNU General Public License, either version 2
8  * of the License, or any later version.
9  *
10  * For the full copyright and license information, please read the
11  * LICENSE.txt file that was distributed with this source code.
12  *
13  * The TYPO3 project - inspiring people to share!
14  */
15 
17 
32 
37 {
38 
42  public ‪$reasons = [
43  -1 => 'mtime matched the document, so no changes detected and no content updated',
44  -2 => 'The minimum age was not exceeded',
45  1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
46  2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
47  3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
48  4 => 'Page has never been indexed (is not represented in the index_phash table).'
49  ];
50 
56  public ‪$excludeSections = 'script,style';
57 
63  public ‪$external_parsers = [];
64 
72  public ‪$defaultGrList = '0,-1';
73 
79  public ‪$tstamp_maxAge = 0;
80 
87  public ‪$tstamp_minAge = 0;
88 
94  public ‪$maxExternalFiles = 0;
95 
101  public ‪$forceIndexing = false;
102 
109  'title' => '',
110  'description' => '',
111  'keywords' => '',
112  'body' => ''
113  ];
114 
118  public ‪$wordcount = 0;
119 
123  public ‪$externalFileCounter = 0;
124 
128  public ‪$conf = [];
129 
135  public ‪$indexerConfig = [];
136 
142  public ‪$hash = [];
143 
149  public ‪$file_phash_arr = [];
150 
156  public ‪$contentParts = [];
157 
163  public ‪$content_md5h = '';
164 
168  public ‪$internal_log = [];
169 
175  public ‪$indexExternalUrl_content = '';
176 
180  public ‪$freqRange = 32000;
181 
185  public ‪$freqMax = 0.1;
186 
190  public ‪$enableMetaphoneSearch = false;
191 
196 
200  public ‪$metaphoneContent = '';
201 
207  public ‪$metaphoneObj;
208 
214  public ‪$lexerObj;
215 
219  public ‪$flagBitMask;
220 
224  protected ‪$timeTracker;
225 
229  public function ‪__construct()
230  {
231  $this->timeTracker = GeneralUtility::makeInstance(TimeTracker::class);
232  // Indexer configuration from Extension Manager interface
233  $this->indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');
234  $this->tstamp_minAge = ‪MathUtility::forceIntegerInRange((int)($this->indexerConfig['minAge'] ?? 0) * 3600, 0);
235  $this->tstamp_maxAge = ‪MathUtility::forceIntegerInRange((int)($this->indexerConfig['maxAge'] ?? 0) * 3600, 0);
236  $this->maxExternalFiles = ‪MathUtility::forceIntegerInRange($this->indexerConfig['maxExternalFiles'], 0, 1000, 5);
237  $this->flagBitMask = ‪MathUtility::forceIntegerInRange($this->indexerConfig['flagBitMask'], 0, 255);
238  // Workaround: If the extension configuration was not updated yet, the value is not existing
239  $this->enableMetaphoneSearch = !isset($this->indexerConfig['enableMetaphoneSearch']) || $this->indexerConfig['enableMetaphoneSearch'];
240  $this->storeMetaphoneInfoAsWords = !‪IndexedSearchUtility::isTableUsed('index_words') && ‪$this->enableMetaphoneSearch;
241  }
242 
243  /********************************
244  *
245  * Initialization
246  *
247  *******************************/
248 
253  public function ‪init(array $configuration = null)
254  {
255  if (is_array($configuration)) {
256  $this->conf = $configuration;
257  }
258  // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
259  $this->‪setT3Hashes();
260  // Initialize external document parsers:
261  // Example configuration, see ext_localconf.php of this file!
262  if ($this->conf['index_externals']) {
264  }
265  // Initialize lexer (class that deconstructs the text into words):
266  $lexerObjectClassName = ‪$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'] ?: Lexer::class;
267  $this->lexerObj = GeneralUtility::makeInstance($lexerObjectClassName);
268  $this->lexerObj->debug = $this->indexerConfig['debugMode'];
269  // Initialize metaphone hook:
270  // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
271  if ($this->enableMetaphoneSearch && ‪$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']) {
272  $this->metaphoneObj = GeneralUtility::makeInstance(‪$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']);
273  $this->metaphoneObj->pObj = $this;
274  }
275  }
276 
283  public function ‪initializeExternalParsers()
284  {
285  foreach (‪$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] ?? [] as $extension => $className) {
286  $this->external_parsers[$extension] = GeneralUtility::makeInstance($className);
287  $this->external_parsers[$extension]->pObj = $this;
288  // Init parser and if it returns FALSE, unset its entry again:
289  if (!$this->external_parsers[$extension]->initParser($extension)) {
290  unset($this->external_parsers[$extension]);
291  }
292  }
293  }
294 
295  /********************************
296  *
297  * Indexing; TYPO3 pages (HTML content)
298  *
299  *******************************/
303  public function ‪indexTypo3PageContent()
304  {
305  $check = $this->‪checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
306  $is_grlist = $this->‪is_grlist_set($this->hash['phash']);
307  if ($check > 0 || !$is_grlist || $this->forceIndexing) {
308  // Setting message:
309  if ($this->forceIndexing) {
310  $this->‪log_setTSlogMessage('Indexing needed, reason: Forced', 1);
311  } elseif ($check > 0) {
312  $this->‪log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
313  } else {
314  $this->‪log_setTSlogMessage('Indexing needed, reason: Updates gr_list!', 1);
315  }
316  // Divide into title,keywords,description and body:
317  $this->‪log_push('Split content', '');
318  $this->contentParts = $this->‪splitHTMLContent($this->conf['content']);
319  if ($this->conf['indexedDocTitle']) {
320  $this->contentParts['title'] = $this->conf['indexedDocTitle'];
321  }
322  $this->‪log_pull();
323  // Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so don't!)
324  $this->content_md5h = ‪IndexedSearchUtility::md5inthash(implode('', $this->contentParts));
325  // This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
326  // If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more.
327  // This will also prevent pages from being indexed if a fe_users has logged in and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem.
328  $checkCHash = $this->‪checkContentHash();
329  if (!is_array($checkCHash) || $check === 1) {
331  $this->‪log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
332  $this->‪charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
333  $this->‪log_pull();
334  // Splitting words
335  $this->‪log_push('Extract words from content', '');
336  $splitInWords = $this->‪processWordsInArrays($this->contentParts);
337  $this->‪log_pull();
338  // Analyze the indexed words.
339  $this->‪log_push('Analyze the extracted words', '');
340  $indexArr = $this->‪indexAnalyze($splitInWords);
341  $this->‪log_pull();
342  // Submitting page (phash) record
343  $this->‪log_push('Submitting page', '');
344  $this->‪submitPage();
345  $this->‪log_pull();
346  // Check words and submit to word list if not there
347  $this->‪log_push('Check word list and submit words', '');
348  if (‪IndexedSearchUtility::isTableUsed('index_words')) {
349  $this->‪checkWordList($indexArr);
350  $this->‪submitWords($indexArr, $this->hash['phash']);
351  }
352  $this->‪log_pull();
353  // Set parsetime
354  $this->‪updateParsetime($this->hash['phash'], ‪IndexedSearchUtility::milliseconds() - $Pstart);
355  // Checking external files if configured for.
356  $this->‪log_push('Checking external files', '');
357  if ($this->conf['index_externals']) {
358  $this->‪extractLinks($this->conf['content']);
359  }
360  $this->‪log_pull();
361  } else {
362  // Update the timestamp
363  $this->‪updateTstamp($this->hash['phash'], $this->conf['mtime']);
364  $this->‪updateSetId($this->hash['phash']);
365  // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
366  $this->‪update_grlist($checkCHash['phash'], $this->hash['phash']);
367  $this->‪updateRootline();
368  $this->‪log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
369  }
370  } else {
371  $this->‪log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
372  }
373  }
374 
382  public function ‪splitHTMLContent($content)
383  {
384  // divide head from body ( u-ouh :) )
385  $contentArr = ‪$this->defaultContentArray;
386  $contentArr['body'] = stristr($content, '<body');
387  $headPart = substr($content, 0, -strlen($contentArr['body']));
388  // get title
389  $this->‪embracingTags($headPart, 'TITLE', $contentArr['title'], $dummy2, $dummy);
390  $titleParts = explode(':', $contentArr['title'], 2);
391  $contentArr['title'] = trim($titleParts[1] ?? $titleParts[0]);
392  // get keywords and description metatags
393  if ($this->conf['index_metatags']) {
394  $meta = [];
395  $i = 0;
396  while ($this->‪embracingTags($headPart, 'meta', $dummy, $headPart, $meta[$i])) {
397  $i++;
398  }
399  // @todo The code below stops at first unset tag. Is that correct?
400  for ($i = 0; isset($meta[$i]); $i++) {
401  // decode HTML entities, meta tag content needs to be encoded later
402  $meta[$i] = GeneralUtility::get_tag_attributes($meta[$i], true);
403  if (stripos($meta[$i]['name'], 'keywords') !== false) {
404  $contentArr['keywords'] .= ',' . $this->‪addSpacesToKeywordList($meta[$i]['content']);
405  }
406  if (stripos($meta[$i]['name'], 'description') !== false) {
407  $contentArr['description'] .= ',' . $meta[$i]['content'];
408  }
409  }
410  }
411  // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
412  $this->‪typoSearchTags($contentArr['body']);
413  // Get rid of unwanted sections (ie. scripting and style stuff) in body
414  $tagList = explode(',', $this->excludeSections);
415  foreach ($tagList as $tag) {
416  while ($this->‪embracingTags($contentArr['body'], $tag, $dummy, $contentArr['body'], $dummy2)) {
417  }
418  }
419  // remove tags, but first make sure we don't concatenate words by doing it
420  $contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
421  $contentArr['body'] = trim(strip_tags($contentArr['body']));
422  $contentArr['keywords'] = trim($contentArr['keywords']);
423  $contentArr['description'] = trim($contentArr['description']);
424  // Return array
425  return $contentArr;
426  }
427 
434  public function ‪getHTMLcharset($content)
435  {
436  if (preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i', $content, $reg)) {
437  if (preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i', $reg[0], $reg2)) {
438  return $reg2[1];
439  }
440  }
441 
442  return '';
443  }
444 
452  public function ‪convertHTMLToUtf8($content, $charset = '')
453  {
454  // Find charset:
455  $charset = $charset ?: $this->‪getHTMLcharset($content);
456  $charset = trim(strtolower($charset));
457  // Convert charset:
458  if ($charset && $charset !== 'utf-8') {
459  $content = mb_convert_encoding($content, 'utf-8', $charset);
460  }
461  // Convert entities, assuming document is now UTF-8:
462  return html_entity_decode($content);
463  }
464 
477  public function ‪embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
478  {
479  $endTag = '</' . $tagName . '>';
480  $startTag = '<' . $tagName;
481  // stristr used because we want a case-insensitive search for the tag.
482  $isTagInText = stristr($string, $startTag);
483  // if the tag was not found, return FALSE
484  if (!$isTagInText) {
485  return false;
486  }
487  [$paramList, $isTagInText] = explode('>', substr($isTagInText, strlen($startTag)), 2);
488  $afterTagInText = stristr($isTagInText, $endTag);
489  if ($afterTagInText) {
490  $stringBefore = substr($string, 0, (int)strpos(strtolower($string), strtolower($startTag)));
491  $tagContent = substr($isTagInText, 0, strlen($isTagInText) - strlen($afterTagInText));
492  $stringAfter = $stringBefore . substr($afterTagInText, strlen($endTag));
493  } else {
494  $tagContent = '';
495  $stringAfter = $isTagInText;
496  }
497  return true;
498  }
499 
506  public function ‪typoSearchTags(&$body)
507  {
508  $expBody = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body);
509  $expBody = $expBody ?: [];
510  if (count($expBody) > 1) {
511  $body = '';
512  $prev = '';
513  foreach ($expBody as $val) {
514  $part = explode('-->', $val, 2);
515  if (trim($part[0]) === 'begin') {
516  $body .= $part[1];
517  $prev = '';
518  } elseif (trim($part[0]) === 'end') {
519  $body .= $prev;
520  } else {
521  $prev = $val;
522  }
523  }
524  return true;
525  }
526  return false;
527  }
528 
534  public function ‪extractLinks($content)
535  {
536  $crawler = null;
537  // Get links:
538  $list = $this->‪extractHyperLinks($content);
539  if ($this->indexerConfig['useCrawlerForExternalFiles'] && ‪ExtensionManagementUtility::isLoaded('crawler')) {
544  $crawler = GeneralUtility::makeInstance('tx_crawler_lib');
545  }
546  // Traverse links:
547  foreach ($list as $linkInfo) {
548  // Decode entities:
549  if ($linkInfo['localPath']) {
550  // localPath means: This file is sent by a download script. While the indexed URL has to point to $linkInfo['href'], the absolute path to the file is specified here!
551  $linkSource = htmlspecialchars_decode($linkInfo['localPath']);
552  } else {
553  $linkSource = htmlspecialchars_decode($linkInfo['href']);
554  }
555  // Parse URL:
556  $qParts = parse_url($linkSource);
557  // Check for jumpurl (TYPO3 specific thing...)
558  if ($qParts['query'] && strpos($qParts['query'], 'jumpurl=') !== false) {
559  parse_str($qParts['query'], $getP);
560  $linkSource = $getP['jumpurl'];
561  $qParts = parse_url($linkSource);
562  }
563  if (!$linkInfo['localPath'] && $qParts['scheme']) {
564  if ($this->indexerConfig['indexExternalURLs']) {
565  // Index external URL (http or otherwise)
566  $this->‪indexExternalUrl($linkSource);
567  }
568  } elseif (!$qParts['query']) {
569  $linkSource = urldecode($linkSource);
570  if (GeneralUtility::isAllowedAbsPath($linkSource)) {
571  $localFile = $linkSource;
572  } else {
573  $localFile = GeneralUtility::getFileAbsFileName(‪Environment::getPublicPath() . '/' . $linkSource);
574  }
575  if ($localFile && @is_file($localFile)) {
576  // Index local file:
577  if ($linkInfo['localPath']) {
578  $fI = pathinfo($linkSource);
579  $ext = strtolower($fI['extension']);
580  if (is_object($crawler)) {
581  $params = [
582  'document' => $linkSource,
583  'alturl' => $linkInfo['href'],
584  'conf' => ‪$this->conf
585  ];
586  unset($params['conf']['content']);
587  $crawler->addQueueEntry_callBack(0, $params, CrawlerFilesHook::class, $this->conf['id']);
588  $this->‪log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
589  } else {
590  $this->‪indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
591  }
592  } else {
593  if (is_object($crawler)) {
594  $params = [
595  'document' => $linkSource,
596  'conf' => ‪$this->conf
597  ];
598  unset($params['conf']['content']);
599  $crawler->addQueueEntry_callBack(0, $params, CrawlerFilesHook::class, $this->conf['id']);
600  $this->‪log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
601  } else {
602  $this->‪indexRegularDocument($linkSource);
603  }
604  }
605  }
606  }
607  }
608  }
609 
617  public function ‪extractHyperLinks($html)
618  {
619  $htmlParser = GeneralUtility::makeInstance(HtmlParser::class);
620  $htmlParts = $htmlParser->splitTags('a', $html);
621  $hyperLinksData = [];
622  foreach ($htmlParts as $index => $tagData) {
623  if ($index % 2 !== 0) {
624  $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
625  $firstTagName = $htmlParser->getFirstTagName($tagData);
626  if (strtolower($firstTagName) === 'a') {
627  if ($tagAttributes[0]['href'] && $tagAttributes[0]['href'][0] !== '#') {
628  $hyperLinksData[] = [
629  'tag' => $tagData,
630  'href' => $tagAttributes[0]['href'],
631  'localPath' => $this->‪createLocalPath(urldecode($tagAttributes[0]['href']))
632  ];
633  }
634  }
635  }
636  }
637  return $hyperLinksData;
638  }
639 
646  public function ‪extractBaseHref($html)
647  {
648  $href = '';
649  $htmlParser = GeneralUtility::makeInstance(HtmlParser::class);
650  $htmlParts = $htmlParser->splitTags('base', $html);
651  foreach ($htmlParts as $index => $tagData) {
652  if ($index % 2 !== 0) {
653  $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
654  $firstTagName = $htmlParser->getFirstTagName($tagData);
655  if (strtolower($firstTagName) === 'base') {
656  $href = $tagAttributes[0]['href'];
657  if ($href) {
658  break;
659  }
660  }
661  }
662  }
663  return $href;
664  }
665 
666  /******************************************
667  *
668  * Indexing; external URL
669  *
670  ******************************************/
677  public function ‪indexExternalUrl($externalUrl)
678  {
679  // Get headers:
680  $urlHeaders = $this->‪getUrlHeaders($externalUrl);
681  if (is_array($urlHeaders) && stripos($urlHeaders['Content-Type'], 'text/html') !== false) {
682  $content = ($this->indexExternalUrl_content = ‪GeneralUtility::getUrl($externalUrl));
683  if ((string)$content !== '') {
684  // Create temporary file:
685  $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
686  if ($tmpFile) {
687  ‪GeneralUtility::writeFile($tmpFile, $content);
688  // Index that file:
689  $this->‪indexRegularDocument($externalUrl, true, $tmpFile, 'html');
690  // Using "TRUE" for second parameter to force indexing of external URLs (mtime doesn't make sense, does it?)
691  unlink($tmpFile);
692  }
693  }
694  }
695  }
696 
703  public function ‪getUrlHeaders($url)
704  {
705  try {
706  $response = GeneralUtility::makeInstance(RequestFactory::class)->request($url, 'HEAD');
707  $headers = $response->getHeaders();
708  $retVal = [];
709  foreach ($headers as $key => $value) {
710  $retVal[$key] = implode('', $value);
711  }
712  return $retVal;
713  } catch (\Exception $e) {
714  // fail silently if the HTTP request failed
715  return false;
716  }
717  }
718 
725  protected function ‪createLocalPath($sourcePath)
726  {
727  $localPath = '';
728  $pathFunctions = [
729  'createLocalPathUsingAbsRefPrefix',
730  'createLocalPathUsingDomainURL',
731  'createLocalPathFromAbsoluteURL',
732  'createLocalPathFromRelativeURL'
733  ];
734  foreach ($pathFunctions as $functionName) {
735  $localPath = $this->{$functionName}($sourcePath);
736  if ($localPath != '') {
737  break;
738  }
739  }
740  return $localPath;
741  }
742 
749  protected function ‪createLocalPathUsingDomainURL($sourcePath)
750  {
751  $localPath = '';
752  $baseURL = GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
753  $baseURLLength = strlen($baseURL);
754  if (strpos($sourcePath, $baseURL) === 0) {
755  $sourcePath = substr($sourcePath, $baseURLLength);
756  $localPath = ‪Environment::getPublicPath() . '/' . $sourcePath;
757  if (!self::isAllowedLocalFile($localPath)) {
758  $localPath = '';
759  }
760  }
761  return $localPath;
762  }
763 
771  protected function ‪createLocalPathUsingAbsRefPrefix($sourcePath)
772  {
773  $localPath = '';
774  if (isset(‪$GLOBALS['TSFE']) && ‪$GLOBALS['TSFE'] instanceof TypoScriptFrontendController) {
775  $absRefPrefix = ‪$GLOBALS['TSFE']->config['config']['absRefPrefix'];
776  $absRefPrefixLength = strlen($absRefPrefix);
777  if ($absRefPrefixLength > 0 && strpos($sourcePath, $absRefPrefix) === 0) {
778  $sourcePath = substr($sourcePath, $absRefPrefixLength);
779  $localPath = ‪Environment::getPublicPath() . '/' . $sourcePath;
780  if (!self::isAllowedLocalFile($localPath)) {
781  $localPath = '';
782  }
783  }
784  }
785  return $localPath;
786  }
787 
795  protected function ‪createLocalPathFromAbsoluteURL($sourcePath)
796  {
797  $localPath = '';
798  if ($sourcePath[0] === '/') {
799  $sourcePath = substr($sourcePath, 1);
800  $localPath = ‪Environment::getPublicPath() . '/' . $sourcePath;
801  if (!self::isAllowedLocalFile($localPath)) {
802  $localPath = '';
803  }
804  }
805  return $localPath;
806  }
807 
814  protected function ‪createLocalPathFromRelativeURL($sourcePath)
815  {
816  $localPath = '';
817  if (self::isRelativeURL($sourcePath)) {
818  $localPath = ‪Environment::getPublicPath() . '/' . $sourcePath;
819  if (!self::isAllowedLocalFile($localPath)) {
820  $localPath = '';
821  }
822  }
823  return $localPath;
824  }
825 
832  protected static function ‪isRelativeURL($url)
833  {
834  $urlParts = @parse_url($url);
835  return (!isset($urlParts['scheme']) || $urlParts['scheme'] === '') && $urlParts['path'][0] !== '/';
836  }
837 
844  protected static function ‪isAllowedLocalFile($filePath)
845  {
846  $filePath = GeneralUtility::resolveBackPath($filePath);
847  $insideWebPath = strpos($filePath, ‪Environment::getPublicPath()) === 0;
848  $isFile = is_file($filePath);
849  return $insideWebPath && $isFile;
850  }
851 
852  /******************************************
853  *
854  * Indexing; external files (PDF, DOC, etc)
855  *
856  ******************************************/
865  public function ‪indexRegularDocument($file, $force = false, $contentTmpFile = '', $altExtension = '')
866  {
867  // Init
868  $fI = pathinfo($file);
869  $ext = $altExtension ?: strtolower($fI['extension']);
870  // Create abs-path:
871  if (!$contentTmpFile) {
872  if (!GeneralUtility::isAbsPath($file)) {
873  // Relative, prepend public web path:
874  $absFile = GeneralUtility::getFileAbsFileName(‪Environment::getPublicPath() . '/' . $file);
875  } else {
876  // Absolute, pass-through:
877  $absFile = $file;
878  }
879  $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
880  } else {
881  $absFile = $contentTmpFile;
882  }
883  // Indexing the document:
884  if ($absFile && @is_file($absFile)) {
885  if ($this->external_parsers[$ext]) {
886  $fileInfo = stat($absFile);
887  $cParts = $this->‪fileContentParts($ext, $absFile);
888  foreach ($cParts as $cPKey) {
889  $this->internal_log = [];
890  $this->‪log_push('Index: ' . str_replace('.', '_', ‪PathUtility::basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
892  $subinfo = ['key' => $cPKey];
893  // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
894  $phash_arr = ($this->file_phash_arr = $this->‪setExtHashes($file, $subinfo));
895  $check = $this->‪checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
896  if ($check > 0 || $force) {
897  if ($check > 0) {
898  $this->‪log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
899  } else {
900  $this->‪log_setTSlogMessage('Indexing forced by flag', 1);
901  }
902  // Check external file counter:
903  if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
904  // Divide into title,keywords,description and body:
905  $this->‪log_push('Split content', '');
906  ‪$contentParts = $this->‪readFileContent($ext, $absFile, $cPKey);
907  $this->‪log_pull();
908  if (is_array(‪$contentParts)) {
909  // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
911  if ($this->‪checkExternalDocContentHash($phash_arr['phash_grouping'], ‪$content_md5h) || $force) {
912  // Increment counter:
913  $this->externalFileCounter++;
914  // Splitting words
915  $this->‪log_push('Extract words from content', '');
916  $splitInWords = $this->‪processWordsInArrays(‪$contentParts);
917  $this->‪log_pull();
918  // Analyze the indexed words.
919  $this->‪log_push('Analyze the extracted words', '');
920  $indexArr = $this->‪indexAnalyze($splitInWords);
921  $this->‪log_pull();
922  // Submitting page (phash) record
923  $this->‪log_push('Submitting page', '');
924  // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
925  $this->‪submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], ‪$content_md5h, ‪$contentParts);
926  $this->‪log_pull();
927  // Check words and submit to word list if not there
928  $this->‪log_push('Check word list and submit words', '');
929  if (‪IndexedSearchUtility::isTableUsed('index_words')) {
930  $this->‪checkWordList($indexArr);
931  $this->‪submitWords($indexArr, $phash_arr['phash']);
932  }
933  $this->‪log_pull();
934  // Set parsetime
935  $this->‪updateParsetime($phash_arr['phash'], ‪IndexedSearchUtility::milliseconds() - $Pstart);
936  } else {
937  // Update the timestamp
938  $this->‪updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
939  $this->‪log_setTSlogMessage('Indexing not needed, the contentHash, ' . ‪$content_md5h . ', has not changed. Timestamp updated.');
940  }
941  } else {
942  $this->‪log_setTSlogMessage('Could not index file! Unsupported extension.');
943  }
944  } else {
945  $this->‪log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
946  }
947  } else {
948  $this->‪log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
949  }
950  // Checking and setting sections:
951  $this->‪submitFile_section($phash_arr['phash']);
952  // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
953  $this->‪log_pull();
954  }
955  } else {
956  $this->‪log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
957  }
958  } else {
959  $this->‪log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
960  }
961  }
962 
972  public function ‪readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
973  {
974  $contentArray = null;
975  // Consult relevant external document parser:
976  if (is_object($this->external_parsers[$fileExtension])) {
977  $contentArray = $this->external_parsers[$fileExtension]->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
978  }
979  return $contentArray;
980  }
981 
989  public function ‪fileContentParts($ext, $absFile)
990  {
991  $cParts = [0];
992  // Consult relevant external document parser:
993  if (is_object($this->external_parsers[$ext])) {
994  $cParts = $this->external_parsers[$ext]->fileContentParts($ext, $absFile);
995  }
996  return $cParts;
997  }
998 
1006  public function ‪splitRegularContent($content)
1007  {
1008  $contentArr = ‪$this->defaultContentArray;
1009  $contentArr['body'] = $content;
1010  return $contentArr;
1011  }
1012 
1013  /**********************************
1014  *
1015  * Analysing content, Extracting words
1016  *
1017  **********************************/
1024  public function ‪charsetEntity2utf8(&$contentArr, $charset)
1025  {
1026  // Convert charset if necessary
1027  foreach ($contentArr as $key => $value) {
1028  if ((string)$contentArr[$key] !== '') {
1029  if ($charset !== 'utf-8') {
1030  $contentArr[$key] = mb_convert_encoding($contentArr[$key], 'utf-8', $charset);
1031  }
1032  // decode all numeric / html-entities in the string to real characters:
1033  $contentArr[$key] = html_entity_decode($contentArr[$key]);
1034  }
1035  }
1036  }
1037 
1044  public function ‪processWordsInArrays($contentArr)
1045  {
1046  // split all parts to words
1047  foreach ($contentArr as $key => $value) {
1048  $contentArr[$key] = $this->lexerObj->split2Words($contentArr[$key]);
1049  }
1050  // For title, keywords, and description we don't want duplicates:
1051  $contentArr['title'] = array_unique($contentArr['title']);
1052  $contentArr['keywords'] = array_unique($contentArr['keywords']);
1053  $contentArr['description'] = array_unique($contentArr['description']);
1054  // Return modified array:
1055  return $contentArr;
1056  }
1057 
1064  public function ‪bodyDescription($contentArr)
1065  {
1066  $bodyDescription = '';
1067  // Setting description
1068  $maxL = ‪MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
1069  if ($maxL) {
1070  $bodyDescription = preg_replace('/\s+/u', ' ', $contentArr['body']);
1071  // Shorten the string. If the database has the wrong character set
1072  // set the string is probably truncated again. mb_strcut can not be
1073  // used here because it's not part of the fallback package
1074  // symfony/polyfill-mbstring in case of the missing ext:mbstring.
1075  $bodyDescription = \mb_substr($bodyDescription, 0, $maxL, 'utf-8');
1076  }
1077  return $bodyDescription;
1078  }
1079 
1086  public function ‪indexAnalyze($content)
1087  {
1088  $indexArr = [];
1089  $this->‪analyzeHeaderinfo($indexArr, $content, 'title', 7);
1090  $this->‪analyzeHeaderinfo($indexArr, $content, 'keywords', 6);
1091  $this->‪analyzeHeaderinfo($indexArr, $content, 'description', 5);
1092  $this->‪analyzeBody($indexArr, $content);
1093  return $indexArr;
1094  }
1095 
1104  public function ‪analyzeHeaderinfo(&$retArr, $content, $key, $offset)
1105  {
1106  foreach ($content[$key] as $val) {
1107  $val = mb_substr($val, 0, 60);
1108  // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
1109  if (!isset($retArr[$val])) {
1110  // Word ID (wid)
1111  $retArr[$val]['hash'] = ‪IndexedSearchUtility::md5inthash($val);
1112  // Metaphone value is also 60 only chars long
1113  $metaphone = $this->enableMetaphoneSearch ? substr($this->‪metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
1114  $retArr[$val]['metaphone'] = $metaphone;
1115  }
1116  // Build metaphone fulltext string (can be used for fulltext indexing)
1117  if ($this->storeMetaphoneInfoAsWords) {
1118  $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
1119  }
1120  // Priority used for flagBitMask feature (see extension configuration)
1121  $retArr[$val]['cmp'] = $retArr[$val]['cmp'] | 2 ** $offset;
1122  // Increase number of occurrences
1123  $retArr[$val]['count']++;
1124  $this->wordcount++;
1125  }
1126  }
1127 
/**
 * Registers the words of the body part in the word index array.
 *
 * Each word gets an entry holding the position of its first occurrence ("first"), its
 * word id ("hash") and metaphone value; the occurrence counter "count" is increased.
 *
 * @param array $retArr Word index array (passed by reference), keyed by the shortened word
 * @param array $content Content array; $content['body'] is iterated as a list of words
 */
public function analyzeBody(&$retArr, $content)
{
    foreach ($content['body'] as $key => $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        // mb_substr() is used (as in analyzeHeaderinfo()) so that a multibyte character is never
        // split in the middle and both methods produce the same key for the same word.
        $val = mb_substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // First occurrence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $retArr[$val]['metaphone'] = $this->enableMetaphoneSearch
                ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60)
                : '';
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Increase number of occurrences, starting from 0 for a new word
        $retArr[$val]['count'] = ($retArr[$val]['count'] ?? 0) + 1;
        $this->wordcount++;
    }
}
1157 
/**
 * Computes the metaphone representation of a word.
 *
 * Uses the configured metaphone object when one is set, otherwise PHP's native metaphone().
 *
 * @param string $word Word to convert
 * @param bool $returnRawMetaphoneValue If set, the raw metaphone value is returned instead of its integer hash
 * @return mixed Raw metaphone value, or its integer hash, or 0 if the raw value is an empty string
 */
public function metaphone($word, $returnRawMetaphoneValue = false)
{
    $rawValue = is_object($this->metaphoneObj)
        ? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
        // Use native PHP function instead of advanced doubleMetaphone class
        : metaphone($word);

    if ($returnRawMetaphoneValue) {
        return $rawValue;
    }
    if ($rawValue === '') {
        return 0;
    }
    // Create hash and return integer
    return IndexedSearchUtility::md5inthash($rawValue);
}
1183 
1184  /********************************
1185  *
1186  * SQL; TYPO3 Pages
1187  *
1188  *******************************/
/**
 * Writes the index records of the current TYPO3 page.
 *
 * Old records for the current phash are removed first, then index_phash, index_section,
 * index_grlist, index_fulltext and (in debug mode) index_debug are filled from
 * $this->hash, $this->conf and $this->contentParts. Tables reported as unused by
 * IndexedSearchUtility::isTableUsed() are skipped.
 */
public function submitPage()
{
    $phash = $this->hash['phash'];
    // Remove any current data for this phash:
    $this->removeOldIndexedPages($phash);
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

    // setting new phash_row
    $fields = [
        'phash' => $phash,
        'phash_grouping' => $this->hash['phash_grouping'],
        'static_page_arguments' => is_array($this->conf['staticPageArguments']) ? json_encode($this->conf['staticPageArguments']) : null,
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // TYPO3 page
        'item_type' => 0,
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => (int)$this->conf['mtime'],
        // Size in bytes, hence strlen()
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        // Creation date of page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connectionPool->getConnectionForTable('index_phash')->insert('index_phash', $fields);
    }

    // PROCESSING index_section
    $this->submit_section($phash, $phash);
    // PROCESSING index_grlist
    $this->submit_grlist($phash, $phash);

    // PROCESSING index_fulltext
    $fields = [
        'phash' => $phash,
        'fulltextdata' => implode(' ', $this->contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        // Character-wise cut: a byte-wise substr() could split a multibyte character
        // and hand an invalid UTF-8 string to the database.
        $fields['fulltextdata'] = mb_substr($fields['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $connectionPool->getConnectionForTable('index_fulltext')->insert('index_fulltext', $fields);
    }

    // PROCESSING index_debug
    if ($this->indexerConfig['debugMode']) {
        // The excerpts are cut character-wise because json_encode() fails (returns false)
        // on strings that end in a truncated multibyte sequence.
        $fields = [
            'phash' => $phash,
            'debuginfo' => json_encode([
                'external_parsers initialized' => array_keys($this->external_parsers),
                'conf' => array_merge($this->conf, ['content' => mb_substr($this->conf['content'], 0, 1000)]),
                'contentParts' => array_merge($this->contentParts, ['body' => mb_substr($this->contentParts['body'], 0, 1000)]),
                'logs' => $this->internal_log,
                'lexer' => $this->lexerObj->debugString
            ])
        ];
        if (IndexedSearchUtility::isTableUsed('index_debug')) {
            $connectionPool->getConnectionForTable('index_debug')->insert('index_debug', $fields);
        }
    }
}
1268 
/**
 * Inserts an index_grlist record for the current gr_list ($this->conf['gr_list']).
 *
 * Nothing is written if the index_grlist table is not in use.
 *
 * @param int $hash Value for the "phash" column
 * @param int $phash_x Value for the "phash_x" column
 */
public function submit_grlist($hash, $phash_x)
{
    $grList = $this->conf['gr_list'];
    // Setting the gr_list record
    $record = [
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($grList),
        'gr_list' => $grList
    ];
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->insert('index_grlist', $record);
}
/**
 * Inserts an index_section record for the current page id ($this->conf['id']).
 *
 * The record carries the rootline fields (rl0, rl1, rl2 and any additionally configured
 * ones) provided by getRootLineFields(), i.e. the same fields updateRootline() maintains
 * on this table. Nothing is written if the index_section table is not in use.
 *
 * @param int $hash Value for the "phash" column
 * @param int $hash_t3 Value for the "phash_t3" column
 */
public function submit_section($hash, $hash_t3)
{
    $fields = [
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id']
    ];
    // Add the rootline columns so that a freshly inserted section record is complete
    $this->getRootLineFields($fields);
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_section');
        $connection->insert('index_section', $fields);
    }
}
1313 
/**
 * Removes all index records that belong to a TYPO3 page.
 *
 * Tables reported as unused by IndexedSearchUtility::isTableUsed() are skipped.
 *
 * @param int $phash phash of the page whose records are removed
 */
public function removeOldIndexedPages($phash)
{
    $phash = (int)$phash;
    // Removing old registrations for all tables. Because the pages are TYPO3 pages
    // there can be nothing else than 1-1 relations here.
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    $tableArray = ['index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug'];
    foreach ($tableArray as $table) {
        if (IndexedSearchUtility::isTableUsed($table)) {
            $connectionPool->getConnectionForTable($table)->delete($table, ['phash' => $phash]);
        }
    }

    // Removing all index_section records with hash_t3 set to this hash (this includes such
    // records set for external media on the page as well!). The re-insert of these records
    // are done in indexRegularDocument($file).
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        $connectionPool->getConnectionForTable('index_section')
            ->delete('index_section', ['phash_t3' => $phash]);
    }
}
1339 
1340  /********************************
1341  *
1342  * SQL; External media
1343  *
1344  *******************************/
/**
 * Writes the index records of an external file (or external URL document).
 *
 * Old records for the file's phash are removed first, then index_phash, index_fulltext
 * and (in debug mode) index_debug are filled. Tables reported as unused by
 * IndexedSearchUtility::isTableUsed() are skipped.
 *
 * @param array $hash Hash array with the keys "phash" and "phash_grouping"
 * @param string $file File name or URL of the document
 * @param array $subinfo Sub-information of the document; stored JSON encoded in "static_page_arguments"
 * @param string $ext File extension, used to look up the parser and the item type
 * @param int $mtime Modification time, stored as "item_mtime"
 * @param int $ctime Creation time, stored as "item_crdate"
 * @param int $size Size, stored as "item_size"
 * @param int $content_md5h Content hash, stored as "contentHash"
 * @param array $contentParts Content parts; "title" and "body" are read explicitly
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
{
    // Find item type: fall back to the extension when the parser maps nothing for it
    $storeItemType = ($this->external_parsers[$ext]->ext2itemtype_map[$ext] ?? '') ?: $ext;
    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($hash['phash']);
    // Split filename. parse_url() yields no "scheme" key for local paths and may return
    // false for malformed input, so the key is tested with !empty() below.
    $fileParts = parse_url($file);
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Setting new
    $fields = [
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'static_page_arguments' => json_encode($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $storeItemType,
        'item_title' => trim($contentParts['title']) ?: PathUtility::basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        'externalUrl' => !empty($fileParts['scheme']) ? 1 : 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connectionPool->getConnectionForTable('index_phash')->insert('index_phash', $fields);
    }

    // PROCESSING index_fulltext
    $fields = [
        'phash' => $hash['phash'],
        'fulltextdata' => implode(' ', $contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        // Character-wise cut: a byte-wise substr() could split a multibyte character
        // and hand an invalid UTF-8 string to the database.
        $fields['fulltextdata'] = mb_substr($fields['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $connectionPool->getConnectionForTable('index_fulltext')->insert('index_fulltext', $fields);
    }

    // PROCESSING index_debug
    if ($this->indexerConfig['debugMode']) {
        // The body excerpt is cut character-wise because json_encode() fails (returns false)
        // on strings that end in a truncated multibyte sequence.
        $fields = [
            'phash' => $hash['phash'],
            'debuginfo' => json_encode([
                'static_page_arguments' => $subinfo,
                'contentParts' => array_merge($contentParts, ['body' => mb_substr($contentParts['body'], 0, 1000)]),
                'logs' => $this->internal_log,
                'lexer' => $this->lexerObj->debugString
            ])
        ];
        if (IndexedSearchUtility::isTableUsed('index_debug')) {
            $connectionPool->getConnectionForTable('index_debug')->insert('index_debug', $fields);
        }
    }
}
1429 
/**
 * Makes sure an index_grlist record exists for an indexed file.
 *
 * A record is inserted only if the file has neither a record for the default gr_list
 * ($this->defaultGrList) nor one for the current gr_list ($this->conf['gr_list']).
 * Nothing happens if the index_grlist table is not in use.
 *
 * @param int $hash phash of the file
 */
public function submitFile_grlist($hash)
{
    // Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_grlist');
    $count = (int)$queryBuilder->count('*')
        ->from('index_grlist')
        ->where(
            $queryBuilder->expr()->eq(
                'phash',
                $queryBuilder->createNamedParameter($hash, \PDO::PARAM_INT)
            ),
            $queryBuilder->expr()->orX(
                $queryBuilder->expr()->eq(
                    'hash_gr_list',
                    $queryBuilder->createNamedParameter(
                        IndexedSearchUtility::md5inthash($this->defaultGrList),
                        \PDO::PARAM_INT
                    )
                ),
                $queryBuilder->expr()->eq(
                    'hash_gr_list',
                    $queryBuilder->createNamedParameter(
                        IndexedSearchUtility::md5inthash($this->conf['gr_list']),
                        \PDO::PARAM_INT
                    )
                )
            )
        )
        ->execute()
        ->fetchColumn();

    if ($count === 0) {
        // No matching record yet: register the file for the current gr_list, linked to the
        // phash of the current page (mirrors what submitFile_section() does for sections).
        $this->submit_grlist($hash, $this->hash['phash']);
    }
}
1475 
/**
 * Makes sure an index_section record exists for an indexed file on the current page.
 *
 * A record is inserted via submit_section() only if none exists yet for the combination
 * of the file's phash and the current page id. Nothing happens if the index_section
 * table is not in use.
 *
 * @param int $hash phash of the file
 */
public function submitFile_section($hash)
{
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }

    // Testing if there is already a section
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_section');
    $matchesPhash = $queryBuilder->expr()->eq(
        'phash',
        $queryBuilder->createNamedParameter($hash, \PDO::PARAM_INT)
    );
    $matchesPage = $queryBuilder->expr()->eq(
        'page_id',
        $queryBuilder->createNamedParameter($this->conf['id'], \PDO::PARAM_INT)
    );
    $existingSections = (int)$queryBuilder->count('phash')
        ->from('index_section')
        ->where($matchesPhash, $matchesPage)
        ->execute()
        ->fetchColumn();

    if ($existingSections !== 0) {
        return;
    }
    $this->submit_section($hash, $this->hash['phash']);
}
1509 
/**
 * Removes the index records of an external file from index_phash, index_grlist,
 * index_fulltext and index_debug. Tables that are not in use are skipped.
 *
 * @param int $phash phash of the file whose records are removed
 */
public function removeOldIndexedFiles($phash)
{
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    // Removing old registrations for tables.
    foreach (['index_phash', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        if (IndexedSearchUtility::isTableUsed($tableName)) {
            $connectionPool->getConnectionForTable($tableName)->delete($tableName, ['phash' => (int)$phash]);
        }
    }
}
1527 
1528  /********************************
1529  *
1530  * SQL Helper functions
1531  *
1532  *******************************/
/**
 * Decides whether a document has to be (re-)indexed, based on its index_phash record,
 * the configured min/max age and the modification time.
 *
 * The returned code corresponds to the keys of $this->reasons: positive values mean
 * "index the document", negative values mean "do not index".
 *
 * @param int $mtime Modification time of the document; a falsy value means "not set"
 * @param int $phash phash of the document
 * @return int 1, 2, 3 or 4 if the document should be indexed, -1 or -2 if not
 */
public function checkMtimeTstamp($mtime, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        // Not indexed (not in index_phash)
        return 4;
    }

    $indexedRow = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(['item_mtime', 'tstamp'], 'index_phash', ['phash' => (int)$phash], [], [], 1)
        ->fetch();
    if (empty($indexedRow)) {
        // Page has never been indexed (is not represented in the index_phash table).
        return 4;
    }

    $now = $GLOBALS['EXEC_TIME'];
    if ($this->tstamp_maxAge && $indexedRow['tstamp'] + $this->tstamp_maxAge < $now) {
        // The configured max-age was exceeded for the document and thus it's indexed.
        return 1;
    }
    if ($this->tstamp_minAge && !($indexedRow['tstamp'] + $this->tstamp_minAge < $now)) {
        // The minimum age was not exceeded
        return -2;
    }

    // From here on minAge is either not set or exceeded, so mtime decides.
    if (!$mtime) {
        // The minimum age was exceed, but mtime was not set, so the page was indexed.
        return 3;
    }
    if ($indexedRow['item_mtime'] != $mtime) {
        // The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.
        return 2;
    }

    // mtime matched the document, so no changes detected and no content updated
    if ($this->tstamp_maxAge) {
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($indexedRow['tstamp'] + $this->tstamp_maxAge - $now) . ' seconds to expire time).', 1);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
    }
    return -1;
}
1598 
/**
 * Looks for an index_phash record with the current phash_grouping and content hash.
 *
 * With this check the page will only be indexed if its content is different from the
 * same "phash_grouping" page.
 *
 * @return mixed The found row (array with "phash") if such a record exists; true if
 *               none exists or the index_phash table is not in use
 */
public function checkContentHash()
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $criteria = [
        'phash_grouping' => (int)$this->hash['phash_grouping'],
        'contentHash' => (int)$this->content_md5h
    ];
    $matchingRow = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(['phash'], 'index_phash', $criteria, [], [], 1)
        ->fetch();

    return empty($matchingRow) ? true : $matchingRow;
}
1629 
/**
 * Checks whether no index_phash record exists yet for the given grouping hash and
 * content hash of an external document.
 *
 * @param int $hashGr phash_grouping to look for
 * @param int $content_md5h Content hash to look for
 * @return bool True if no matching record exists or the index_phash table is not in use
 */
public function checkExternalDocContentHash($hashGr, $content_md5h)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash');
    $matches = (int)$connection->count(
        '*',
        'index_phash',
        [
            'phash_grouping' => (int)$hashGr,
            'contentHash' => (int)$content_md5h
        ]
    );

    return $matches === 0;
}
1657 
/**
 * Tells whether at least one index_grlist record exists for the given phash_x.
 *
 * @param int $phash_x phash_x value to look for
 * @return bool True if a record exists; false if not or if index_grlist is not in use
 */
public function is_grlist_set($phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return false;
    }

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist');
    $matches = (int)$connection->count('phash_x', 'index_grlist', ['phash_x' => (int)$phash_x]);

    return $matches > 0;
}
1680 
/**
 * Inserts an index_grlist record for the current gr_list and the given phash, unless
 * one exists already. Nothing happens if the index_grlist table is not in use.
 *
 * @param int $phash phash of the record
 * @param int $phash_x phash_x passed on to submit_grlist()
 */
public function update_grlist($phash, $phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $criteria = [
        'phash' => (int)$phash,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($this->conf['gr_list'])
    ];
    $existing = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count('phash', 'index_grlist', $criteria);
    if ($existing !== 0) {
        return;
    }

    $this->submit_grlist($phash, $phash_x);
    $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
}
1708 
/**
 * Sets the "tstamp" of an index_phash record to the current execution time and,
 * optionally, updates its "item_mtime". Nothing happens if the index_phash table is
 * not in use.
 *
 * @param int $phash phash of the record to update
 * @param int $mtime If set (non-zero), "item_mtime" is updated to this value as well
 */
public function updateTstamp($phash, $mtime = 0)
{
    // Same guard as in updateSetId() and updateParsetime()
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }

    $updateFields = [
        'tstamp' => $GLOBALS['EXEC_TIME']
    ];

    if ($mtime) {
        $updateFields['item_mtime'] = (int)$mtime;
    }

    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->update(
            'index_phash',
            $updateFields,
            [
                'phash' => (int)$phash
            ]
        );
}
1739 
/**
 * Writes the current "freeIndexSetId" ($this->conf['freeIndexSetId']) into the
 * index_phash record of the given phash. Nothing happens if the table is not in use.
 *
 * @param int $phash phash of the record to update
 */
public function updateSetId($phash)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $newValues = ['freeIndexSetId' => (int)$this->conf['freeIndexSetId']];
        $identifier = ['phash' => (int)$phash];
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash')
            ->update('index_phash', $newValues, $identifier);
    }
}
1763 
/**
 * Stores the parse time in the index_phash record of the given phash.
 * Nothing happens if the index_phash table is not in use.
 *
 * @param int $phash phash of the record to update
 * @param int $parsetime Value for the "parsetime" column (cast to int)
 */
public function updateParsetime($phash, $parsetime)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $newValues = ['parsetime' => (int)$parsetime];
        $identifier = ['phash' => (int)$phash];
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash')
            ->update('index_phash', $newValues, $identifier);
    }
}
1788 
/**
 * Updates the rootline fields of all index_section records of the current page
 * ($this->conf['id']) with the values provided by getRootLineFields().
 * Nothing happens if the index_section table is not in use.
 */
public function updateRootline()
{
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        $rootLineValues = [];
        $this->getRootLineFields($rootLineValues);

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_section');
        $connection->update('index_section', $rootLineValues, ['page_id' => (int)$this->conf['id']]);
    }
}
1811 
/**
 * Adds the rootline fields to a field array.
 *
 * "rl0", "rl1" and "rl2" are always set from levels 0-2 of $this->conf['rootline_uids'].
 * Additional fields can be configured as fieldName => rootLineLevel pairs in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'].
 * A level that is missing in the rootline results in 0.
 *
 * @param array $fieldArray Field array (passed by reference) the rootline fields are added to
 */
public function getRootLineFields(array &$fieldArray)
{
    $rootLineUids = $this->conf['rootline_uids'] ?? [];
    // The null-coalescing reads avoid undefined-offset notices for short rootlines;
    // the resulting value (0) is the same as casting the missing entry to int.
    $fieldArray['rl0'] = (int)($rootLineUids[0] ?? 0);
    $fieldArray['rl1'] = (int)($rootLineUids[1] ?? 0);
    $fieldArray['rl2'] = (int)($rootLineUids[2] ?? 0);
    foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] ?? [] as $fieldName => $rootLineLevel) {
        $fieldArray[$fieldName] = (int)($rootLineUids[$rootLineLevel] ?? 0);
    }
}
1827 
1828  /********************************
1829  *
1830  * SQL; Submitting words
1831  *
1832  *******************************/
/**
 * Makes sure all words of the given word list are present in the index_words table.
 *
 * The number of already stored words is compared to the size of the list; if they
 * differ, the words that are not stored yet are inserted. Nothing happens if the
 * list is empty or the index_words table is not in use.
 *
 * @param array $wordListArray Word list keyed by word; each entry provides "hash" and "metaphone"
 */
public function checkWordList($wordListArray)
{
    if (empty($wordListArray) || !IndexedSearchUtility::isTableUsed('index_words')) {
        return;
    }

    $expectedCount = count($wordListArray);
    $wordIds = array_map('intval', array_column($wordListArray, 'hash'));

    $countQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('index_words');
    $storedCount = (int)$countQuery->count('baseword')
        ->from('index_words')
        ->where(
            $countQuery->expr()->in(
                'wid',
                $countQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        )
        ->execute()
        ->fetchColumn();

    if ($storedCount === $expectedCount) {
        // Every word is stored already
        return;
    }

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_words');
    $selectQuery = $connection->createQueryBuilder();
    $storedWords = $selectQuery->select('baseword')
        ->from('index_words')
        ->where(
            $selectQuery->expr()->in(
                'wid',
                $selectQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        )
        ->execute();

    $this->log_setTSlogMessage('Inserting words: ' . ($expectedCount - $storedCount), 1);
    // Drop every word that is stored already; what remains has to be inserted
    while ($storedWord = $storedWords->fetch()) {
        unset($wordListArray[$storedWord['baseword']]);
    }

    foreach ($wordListArray as $baseWord => $wordData) {
        // A duplicate-key error will occur here if a word is NOT unset in the unset() line. However as
        // long as the words in $wl are NOT longer as 60 chars (the baseword varchar is 60 characters...)
        // this is not a problem.
        $connection->insert(
            'index_words',
            [
                'wid' => $wordData['hash'],
                'baseword' => $baseWord,
                'metaphone' => $wordData['metaphone']
            ]
        );
    }
}
1893 
/**
 * Writes the word relations of a document to the index_rel table.
 *
 * Existing relations of the phash are deleted, then one row per word is bulk-inserted.
 * Words flagged as stop words in index_words (is_stopword != 0) are left out.
 * Nothing happens if the index_rel table is not in use.
 *
 * @param array $wordList Word list; each entry provides "hash", "count", "first" and "cmp"
 * @param int $phash phash of the document
 */
public function submitWords($wordList, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_rel')) {
        return;
    }

    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Collect the word ids of all stop words
    $queryBuilder = $connectionPool->getQueryBuilderForTable('index_words');
    $stopWordStatement = $queryBuilder->select('wid')
        ->from('index_words')
        ->where(
            $queryBuilder->expr()->neq('is_stopword', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT))
        )
        ->groupBy('wid')
        ->execute();
    $stopWordIds = [];
    while ($stopWordRow = $stopWordStatement->fetch()) {
        $stopWordIds[$stopWordRow['wid']] = true;
    }

    $connectionPool->getConnectionForTable('index_rel')->delete('index_rel', ['phash' => (int)$phash]);

    $columns = ['phash', 'wid', 'count', 'first', 'freq', 'flags'];
    $relations = [];
    foreach ($wordList as $word) {
        if (!isset($stopWordIds[$word['hash']])) {
            $relations[] = [
                (int)$phash,
                (int)$word['hash'],
                (int)$word['count'],
                (int)$word['first'],
                $this->freqMap($word['count'] / $this->wordcount),
                $word['cmp'] & $this->flagBitMask
            ];
        }
    }

    if ($relations !== []) {
        $connectionPool->getConnectionForTable('index_rel')->bulkInsert('index_rel', $relations, $columns);
    }
}
/**
 * Maps a word frequency to an integer.
 *
 * Frequencies up to 1 are multiplied by the map factor (freqMax * 100 * freqRange) and
 * capped at freqRange; larger values are divided by the map factor.
 *
 * @param float $freq Frequency to map
 * @return int Mapped frequency
 */
public function freqMap($freq)
{
    $mapFactor = $this->freqMax * 100 * $this->freqRange;
    if ($freq > 1) {
        return (int)($freq / $mapFactor);
    }
    $scaled = $freq * $mapFactor;
    if ($scaled > $this->freqRange) {
        $scaled = $this->freqRange;
    }
    return (int)$scaled;
}
1961 
1962  /********************************
1963  *
1964  * Hashing
1965  *
1966  *******************************/
/**
 * Calculates the hashes of the current TYPO3 page and stores them in $this->hash.
 *
 * "phash_grouping" is built from id, type, language, mount point and static page
 * arguments; "phash" additionally takes the gr_list into account.
 */
public function setT3Hashes()
{
    $staticPageArguments = null;
    if (is_array($this->conf['staticPageArguments'])) {
        $staticPageArguments = json_encode($this->conf['staticPageArguments']);
    }

    // Set main array. The key order must stay as is, because the serialized array is hashed.
    $hashBase = [
        'id' => (int)$this->conf['id'],
        'type' => (int)$this->conf['type'],
        'sys_lang' => (int)$this->conf['sys_language_uid'],
        'MP' => (string)$this->conf['MP'],
        'staticPageArguments' => $staticPageArguments,
    ];
    // Set grouping hash (Identifies a "page" combined of id, type, language, mountpoint and cHash parameters):
    $this->hash['phash_grouping'] = IndexedSearchUtility::md5inthash(serialize($hashBase));

    // Add gr_list and set plain phash (Subdivision where special page composition based on login is taken
    // into account as well. It is expected that such pages are normally similar regardless of the login.)
    $hashBase['gr_list'] = (string)$this->conf['gr_list'];
    $this->hash['phash'] = IndexedSearchUtility::md5inthash(serialize($hashBase));
}
1986 
/**
 * Calculates the hashes of an external file.
 *
 * "phash_grouping" is built from the file name only; "phash" additionally takes the
 * sub-information into account.
 *
 * @param string $file File name
 * @param array $subinfo Sub-information of the file
 * @return array Array with the keys "phash_grouping" and "phash"
 */
public function setExtHashes($file, $subinfo = [])
{
    // Set main array:
    $hashBase = ['file' => $file];
    // Set grouping hash:
    $groupingHash = IndexedSearchUtility::md5inthash(serialize($hashBase));
    // Add subinfo
    $hashBase['subinfo'] = $subinfo;

    return [
        'phash_grouping' => $groupingHash,
        'phash' => IndexedSearchUtility::md5inthash(serialize($hashBase)),
    ];
}
2008 
2009  /*********************************
2010  *
2011  * Internal logging functions
2012  *
2013  *********************************/
/**
 * Forwards a push() call to the time tracker.
 *
 * @param string $msg Message passed to the time tracker
 * @param string $key Key passed to the time tracker
 */
public function log_push($msg, $key)
{
    $tracker = $this->timeTracker;
    $tracker->push($msg, $key);
}
2024 
/**
 * Forwards a pull() call to the time tracker.
 */
public function log_pull()
{
    $tracker = $this->timeTracker;
    $tracker->pull();
}
2032 
/**
 * Sends a log message to the time tracker and appends it to $this->internal_log.
 *
 * @param string $msg Message to log
 * @param int $errorNum Error number passed to the time tracker
 */
public function log_setTSlogMessage($msg, $errorNum = 0)
{
    $tracker = $this->timeTracker;
    $tracker->setTSlogMessage($msg, $errorNum);
    // Keep a copy of the message in the internal log as well
    $this->internal_log[] = $msg;
}
2044 
/**
 * Normalizes a comma separated keyword list: every keyword is trimmed, the keywords
 * are joined with ", " and the whole string is wrapped in single spaces.
 *
 * @param string $keywordList Comma separated list of keywords
 * @return string Normalized list with a leading and a trailing space
 */
protected function addSpacesToKeywordList($keywordList)
{
    $trimmedKeywords = GeneralUtility::trimExplode(',', $keywordList);
    $joinedKeywords = implode(', ', $trimmedKeywords);
    return ' ' . $joinedKeywords . ' ';
}
2058 }
‪TYPO3\CMS\IndexedSearch\Indexer\splitHTMLContent
‪array splitHTMLContent($content)
Definition: Indexer.php:354
‪TYPO3\CMS\IndexedSearch\Indexer\updateParsetime
‪updateParsetime($phash, $parsetime)
Definition: Indexer.php:1742
‪TYPO3\CMS\IndexedSearch\Indexer\$lexerObj
‪TYPO3 CMS IndexedSearch Lexer $lexerObj
Definition: Indexer.php:188
‪TYPO3\CMS\IndexedSearch
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility\md5inthash
‪static int md5inthash($stringToHash)
Definition: IndexedSearchUtility.php:48
‪TYPO3\CMS\Core\Utility\PathUtility
Definition: PathUtility.php:24
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPathUsingDomainURL
‪string createLocalPathUsingDomainURL($sourcePath)
Definition: Indexer.php:721
‪TYPO3\CMS\Core\Core\Environment\getPublicPath
‪static string getPublicPath()
Definition: Environment.php:180
‪TYPO3\CMS\IndexedSearch\Indexer\submit_grlist
‪submit_grlist($hash, $phash_x)
Definition: Indexer.php:1248
‪TYPO3\CMS\IndexedSearch\Indexer\$defaultContentArray
‪array $defaultContentArray
Definition: Indexer.php:99
‪TYPO3\CMS\Core\Html\HtmlParser
Definition: HtmlParser.php:27
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPathUsingAbsRefPrefix
‪string createLocalPathUsingAbsRefPrefix($sourcePath)
Definition: Indexer.php:743
‪TYPO3\CMS\IndexedSearch\Indexer\indexRegularDocument
‪indexRegularDocument($file, $force=false, $contentTmpFile='', $altExtension='')
Definition: Indexer.php:837
‪TYPO3\CMS\Core\Configuration\ExtensionConfiguration
Definition: ExtensionConfiguration.php:45
‪TYPO3\CMS\IndexedSearch\Indexer\$externalFileCounter
‪int $externalFileCounter
Definition: Indexer.php:112
‪TYPO3\CMS\IndexedSearch\Indexer\updateSetId
‪updateSetId($phash)
Definition: Indexer.php:1717
‪TYPO3\CMS\IndexedSearch\Indexer\isAllowedLocalFile
‪static bool isAllowedLocalFile($filePath)
Definition: Indexer.php:816
‪TYPO3\CMS\IndexedSearch\Indexer\$external_parsers
‪array $external_parsers
Definition: Indexer.php:60
‪TYPO3\CMS\IndexedSearch\Indexer\indexAnalyze
‪array indexAnalyze($content)
Definition: Indexer.php:1058
‪TYPO3\CMS\IndexedSearch\Indexer\$indexerConfig
‪array $indexerConfig
Definition: Indexer.php:122
‪TYPO3\CMS\IndexedSearch\Indexer\$flagBitMask
‪bool $flagBitMask
Definition: Indexer.php:192
‪TYPO3\CMS\IndexedSearch\Indexer\isRelativeURL
‪static bool isRelativeURL($url)
Definition: Indexer.php:804
‪TYPO3\CMS\IndexedSearch\Indexer\$storeMetaphoneInfoAsWords
‪bool $storeMetaphoneInfoAsWords
Definition: Indexer.php:172
‪TYPO3\CMS\Core\Utility\MathUtility\forceIntegerInRange
‪static int forceIntegerInRange($theInt, $min, $max=2000000000, $defaultValue=0)
Definition: MathUtility.php:32
‪TYPO3\CMS\IndexedSearch\Indexer\indexExternalUrl
‪indexExternalUrl($externalUrl)
Definition: Indexer.php:649
‪TYPO3\CMS\Core\Utility\GeneralUtility\getUrl
‪static mixed getUrl($url, $includeHeader=0, $requestHeaders=null, &$report=null)
Definition: GeneralUtility.php:1748
‪TYPO3\CMS\IndexedSearch\Indexer\analyzeHeaderinfo
‪analyzeHeaderinfo(&$retArr, $content, $key, $offset)
Definition: Indexer.php:1076
‪TYPO3\CMS\IndexedSearch\Indexer\removeOldIndexedPages
‪removeOldIndexedPages($phash)
Definition: Indexer.php:1291
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility\isTableUsed
‪static bool isTableUsed($tableName)
Definition: IndexedSearchUtility.php:35
‪TYPO3\CMS\IndexedSearch\Indexer\log_pull
‪log_pull()
Definition: Indexer.php:2000
‪TYPO3\CMS\IndexedSearch\Indexer\$freqMax
‪float $freqMax
Definition: Indexer.php:164
‪TYPO3\CMS\IndexedSearch\Indexer\checkContentHash
‪mixed checkContentHash()
Definition: Indexer.php:1576
‪TYPO3\CMS\IndexedSearch\Indexer\convertHTMLToUtf8
‪string convertHTMLToUtf8($content, $charset='')
Definition: Indexer.php:424
‪$fields
‪$fields
Definition: pages.php:5
‪TYPO3\CMS\IndexedSearch\Indexer\bodyDescription
‪string bodyDescription($contentArr)
Definition: Indexer.php:1036
‪TYPO3\CMS\IndexedSearch\Indexer\update_grlist
‪update_grlist($phash, $phash_x)
Definition: Indexer.php:1660
‪TYPO3\CMS\Core\Utility\PathUtility\basename
‪static string basename($path)
Definition: PathUtility.php:165
‪TYPO3\CMS\IndexedSearch\Indexer\freqMap
‪int freqMap($freq)
Definition: Indexer.php:1922
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPathFromAbsoluteURL
‪string createLocalPathFromAbsoluteURL($sourcePath)
Definition: Indexer.php:767
‪TYPO3\CMS\IndexedSearch\Indexer\$forceIndexing
‪bool $forceIndexing
Definition: Indexer.php:93
‪TYPO3\CMS\IndexedSearch\Indexer\submitFile_section
‪submitFile_section($hash)
Definition: Indexer.php:1453
‪TYPO3\CMS\IndexedSearch\Indexer\$file_phash_arr
‪array $file_phash_arr
Definition: Indexer.php:134
‪TYPO3\CMS\IndexedSearch\Indexer\$wordcount
‪int $wordcount
Definition: Indexer.php:108
‪TYPO3\CMS\IndexedSearch\Indexer\removeOldIndexedFiles
‪removeOldIndexedFiles($phash)
Definition: Indexer.php:1487
‪TYPO3\CMS\Core\Utility\ExtensionManagementUtility
Definition: ExtensionManagementUtility.php:43
‪TYPO3\CMS\IndexedSearch\Indexer\updateRootline
‪updateRootline()
Definition: Indexer.php:1764
‪TYPO3\CMS\IndexedSearch\Indexer\is_grlist_set
‪bool is_grlist_set($phash_x)
Definition: Indexer.php:1636
‪TYPO3\CMS\IndexedSearch\Indexer\log_push
‪log_push($msg, $key)
Definition: Indexer.php:1992
‪TYPO3\CMS\IndexedSearch\Indexer\$excludeSections
‪string $excludeSections
Definition: Indexer.php:54
‪TYPO3\CMS\IndexedSearch\Indexer\checkMtimeTstamp
‪int checkMtimeTstamp($mtime, $phash)
Definition: Indexer.php:1513
‪TYPO3\CMS\IndexedSearch\Indexer\$reasons
‪array $reasons
Definition: Indexer.php:41
‪TYPO3\CMS\IndexedSearch\Indexer\$timeTracker
‪TimeTracker $timeTracker
Definition: Indexer.php:196
‪TYPO3\CMS\IndexedSearch\Indexer\$enableMetaphoneSearch
‪bool $enableMetaphoneSearch
Definition: Indexer.php:168
‪TYPO3\CMS\IndexedSearch\Indexer\charsetEntity2utf8
‪charsetEntity2utf8(&$contentArr, $charset)
Definition: Indexer.php:996
‪TYPO3\CMS\IndexedSearch\Indexer\$indexExternalUrl_content
‪string $indexExternalUrl_content
Definition: Indexer.php:156
‪TYPO3\CMS\IndexedSearch\Indexer\$contentParts
‪array $contentParts
Definition: Indexer.php:140
‪TYPO3\CMS\IndexedSearch\Indexer\$internal_log
‪array $internal_log
Definition: Indexer.php:150
‪TYPO3\CMS\IndexedSearch\Indexer\fileContentParts
‪array fileContentParts($ext, $absFile)
Definition: Indexer.php:961
‪TYPO3\CMS\IndexedSearch\Indexer\setExtHashes
‪array setExtHashes($file, $subinfo=[])
Definition: Indexer.php:1966
‪TYPO3\CMS\IndexedSearch\Indexer\$tstamp_minAge
‪int $tstamp_minAge
Definition: Indexer.php:81
‪TYPO3\CMS\IndexedSearch\Indexer\checkExternalDocContentHash
‪bool checkExternalDocContentHash($hashGr, $content_md5h)
Definition: Indexer.php:1610
‪TYPO3\CMS\IndexedSearch\Indexer\init
‪init(array $configuration=null)
Definition: Indexer.php:225
‪TYPO3\CMS\IndexedSearch\Indexer\readFileContent
‪array readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
Definition: Indexer.php:944
‪TYPO3\CMS\IndexedSearch\Indexer\submitFilePage
‪submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
Definition: Indexer.php:1330
‪TYPO3\CMS\IndexedSearch\Indexer\$hash
‪array $hash
Definition: Indexer.php:128
‪TYPO3\CMS\IndexedSearch\Indexer\submitWords
‪submitWords($wordList, $phash)
Definition: Indexer.php:1872
‪TYPO3\CMS\IndexedSearch\Indexer\$conf
‪array $conf
Definition: Indexer.php:116
‪TYPO3\CMS\Core\Http\RequestFactory
Definition: RequestFactory.php:31
‪TYPO3\CMS\IndexedSearch\Indexer\updateTstamp
‪updateTstamp($phash, $mtime=0)
Definition: Indexer.php:1687
‪TYPO3\CMS\IndexedSearch\Indexer\processWordsInArrays
‪array processWordsInArrays($contentArr)
Definition: Indexer.php:1016
‪TYPO3\CMS\IndexedSearch\Indexer\embracingTags
‪bool embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
Definition: Indexer.php:449
‪TYPO3\CMS\Core\Utility\GeneralUtility\trimExplode
‪static string[] trimExplode($delim, $string, $removeEmptyValues=false, $limit=0)
Definition: GeneralUtility.php:1059
‪TYPO3\CMS\IndexedSearch\Indexer\extractLinks
‪extractLinks($content)
Definition: Indexer.php:506
‪TYPO3\CMS\IndexedSearch\Indexer\indexTypo3PageContent
‪indexTypo3PageContent()
Definition: Indexer.php:275
‪TYPO3\CMS\Core\Database\Connection
Definition: Connection.php:36
‪TYPO3\CMS\IndexedSearch\Indexer\submitFile_grlist
‪submitFile_grlist($hash)
Definition: Indexer.php:1407
‪TYPO3\CMS\IndexedSearch\Indexer\getRootLineFields
‪getRootLineFields(array &$fieldArray)
Definition: Indexer.php:1790
‪TYPO3\CMS\IndexedSearch\Indexer\$tstamp_maxAge
‪int $tstamp_maxAge
Definition: Indexer.php:74
‪TYPO3\CMS\IndexedSearch\Indexer\getUrlHeaders
‪mixed getUrlHeaders($url)
Definition: Indexer.php:675
‪TYPO3\CMS\IndexedSearch\Indexer\$metaphoneObj
‪TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility $metaphoneObj
Definition: Indexer.php:182
‪TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController
Definition: TypoScriptFrontendController.php:98
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility\milliseconds
‪static int milliseconds()
Definition: IndexedSearchUtility.php:175
‪$GLOBALS
‪$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['adminpanel']['modules']
Definition: ext_localconf.php:5
‪TYPO3\CMS\Core\Core\Environment
Definition: Environment.php:40
‪TYPO3\CMS\IndexedSearch\Indexer\submitPage
‪submitPage()
Definition: Indexer.php:1164
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerFilesHook
Definition: CrawlerFilesHook.php:28
‪TYPO3\CMS\IndexedSearch\Indexer\$metaphoneContent
‪string $metaphoneContent
Definition: Indexer.php:176
‪TYPO3\CMS\IndexedSearch\Indexer\$maxExternalFiles
‪int $maxExternalFiles
Definition: Indexer.php:87
‪TYPO3\CMS\Core\Utility\MathUtility
Definition: MathUtility.php:22
‪TYPO3\CMS\IndexedSearch\Indexer\log_setTSlogMessage
‪log_setTSlogMessage($msg, $errorNum=0)
Definition: Indexer.php:2011
‪TYPO3\CMS\IndexedSearch\Indexer\getHTMLcharset
‪string getHTMLcharset($content)
Definition: Indexer.php:406
‪TYPO3\CMS\IndexedSearch\Indexer\extractHyperLinks
‪array extractHyperLinks($html)
Definition: Indexer.php:589
‪TYPO3\CMS\IndexedSearch\Indexer\addSpacesToKeywordList
‪string addSpacesToKeywordList($keywordList)
Definition: Indexer.php:2025
‪TYPO3\CMS\IndexedSearch\Indexer\initializeExternalParsers
‪initializeExternalParsers()
Definition: Indexer.php:255
‪TYPO3\CMS\Core\Database\ConnectionPool
Definition: ConnectionPool.php:46
‪TYPO3\CMS\IndexedSearch\Indexer\__construct
‪__construct()
Definition: Indexer.php:201
‪TYPO3\CMS\IndexedSearch\Indexer\submit_section
‪submit_section($hash, $hash_t3)
Definition: Indexer.php:1271
‪TYPO3\CMS\IndexedSearch\Indexer\splitRegularContent
‪array splitRegularContent($content)
Definition: Indexer.php:978
‪TYPO3\CMS\IndexedSearch\Indexer
Definition: Indexer.php:37
‪TYPO3\CMS\Core\Utility\GeneralUtility
Definition: GeneralUtility.php:46
‪TYPO3\CMS\IndexedSearch\Indexer\metaphone
‪mixed metaphone($word, $returnRawMetaphoneValue=false)
Definition: Indexer.php:1137
‪TYPO3\CMS\Core\Utility\GeneralUtility\writeFile
‪static bool writeFile($file, $content, $changePermissions=false)
Definition: GeneralUtility.php:1836
‪TYPO3\CMS\IndexedSearch\Indexer\typoSearchTags
‪bool typoSearchTags(&$body)
Definition: Indexer.php:478
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPathFromRelativeURL
‪string createLocalPathFromRelativeURL($sourcePath)
Definition: Indexer.php:786
‪TYPO3\CMS\Core\Utility\ExtensionManagementUtility\isLoaded
‪static bool isLoaded($key)
Definition: ExtensionManagementUtility.php:114
‪TYPO3\CMS\IndexedSearch\Indexer\checkWordList
‪checkWordList($wordListArray)
Definition: Indexer.php:1810
‪TYPO3\CMS\Core\TimeTracker\TimeTracker
Definition: TimeTracker.php:30
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility
Definition: IndexedSearchUtility.php:26
‪TYPO3\CMS\IndexedSearch\Indexer\analyzeBody
‪analyzeBody(&$retArr, $content)
Definition: Indexer.php:1106
‪TYPO3\CMS\IndexedSearch\Indexer\$freqRange
‪int $freqRange
Definition: Indexer.php:160
‪TYPO3\CMS\IndexedSearch\Indexer\extractBaseHref
‪string extractBaseHref($html)
Definition: Indexer.php:618
‪TYPO3\CMS\IndexedSearch\Indexer\setT3Hashes
‪setT3Hashes()
Definition: Indexer.php:1942
‪TYPO3\CMS\IndexedSearch\Indexer\$content_md5h
‪string $content_md5h
Definition: Indexer.php:146
‪TYPO3\CMS\IndexedSearch\Indexer\$defaultGrList
‪string $defaultGrList
Definition: Indexer.php:68
‪TYPO3\CMS\IndexedSearch\Indexer\createLocalPath
‪string createLocalPath($sourcePath)
Definition: Indexer.php:697