‪TYPO3CMS  ‪main
Indexer.php
Go to the documentation of this file.
1 <?php
2 
3 declare(strict_types=1);
4 
5 /*
6  * This file is part of the TYPO3 CMS project.
7  *
8  * It is free software; you can redistribute it and/or modify it under
9  * the terms of the GNU General Public License, either version 2
10  * of the License, or any later version.
11  *
12  * For the full copyright and license information, please read the
13  * LICENSE.txt file that was distributed with this source code.
14  *
15  * The TYPO3 project - inspiring people to share!
16  */
17 
19 
20 use Psr\Log\LogLevel;
35 
41 class Indexer
42 {
// Comma-separated tag names; splitHTMLContent() removes every such section from the body before indexing.
46  public string $excludeSections = 'script,style';
47 
// External document parser objects keyed by file extension; filled by initializeExternalParsers().
51  public array $external_parsers = [];
52 
// Set in the constructor from the "maxAge" extension setting multiplied by 3600 (lower bound 0).
56  public int $tstamp_maxAge = 0;
57 
// Set in the constructor from the "minAge" extension setting multiplied by 3600 (lower bound 0).
62  public int $tstamp_minAge = 0;
63 
// Limit compared against $externalFileCounter in indexRegularDocument(); the constructor reads it from
// the "maxExternalFiles" extension setting (default 5, range 0..1000).
67  public int $maxExternalFiles = 0;
68 
// When true, indexTypo3PageContent() indexes the page even if no re-indexing is reported as required.
72  public bool $forceIndexing = false;
73 
// Empty field template handed to IndexingDataAsString::fromArray() by the split*Content() methods.
74  public array $defaultIndexingDataPayload = [
75  'title' => '',
76  'description' => '',
77  'keywords' => '',
78  'body' => '',
79  ];
80 
// Incremented once per analysed word in analyzeHeaderinfo() and analyzeBody().
81  public int $wordcount = 0;
// Incremented in indexRegularDocument() for every external file (part) that actually gets indexed.
82  public int $externalFileCounter = 0;
// Indexing configuration of the current run; replaced by init() when an array is passed.
83  public array $conf = [];
84 
// Extension configuration of "indexed_search", loaded in the constructor.
88  public array $indexerConfig = [];
89 
// Hashes of the current page; the code reads the keys "phash" and "phash_grouping".
// NOTE(review): presumably populated by setT3Hashes() (called from init()) — confirm.
93  public array $hash = [];
94 
// Result of the most recent setExtHashes() call in indexRegularDocument().
98  public array $file_phash_arr = [];
99 
// Title/keywords/description/body of the current page as produced by splitHTMLContent().
100  public IndexingDataAsString $indexingDataStringDto;
101 
// md5 over the concatenated fields of $indexingDataStringDto, computed in indexTypo3PageContent().
105  public string $content_md5h = '';
// Reset to an empty array for every file part processed by indexRegularDocument().
106  public array $internal_log = [];
// Content fetched for the last URL passed to indexExternalUrl().
107  public string $indexExternalUrl_content = '';
// NOTE(review): $freqRange and $freqMax are not used in the visible part of the file — purpose not verifiable here.
108  public int $freqRange = 32000;
109  public float $freqMax = 0.1;
// Lexer instance created in the constructor; used by processWordsInArrays() to split text into words.
110  public Lexer $lexerObj;
// Read in the constructor from the "flagBitMask" extension setting (range 0..255).
111  public int $flagBitMask;
// Used throughout to push/pull timing sections around the indexing steps.
112  protected TimeTracker $timeTracker;
113 
113 
/**
 * Loads the "indexed_search" extension configuration, derives the numeric
 * limits from it and instantiates the lexer.
 */
public function __construct()
{
    $this->timeTracker = GeneralUtility::makeInstance(TimeTracker::class);

    // Indexer configuration from Extension Manager interface
    $this->indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');
    $settings = $this->indexerConfig;

    // minAge/maxAge are multiplied by 3600 (presumably hours converted to seconds).
    $minAge = (int)($settings['minAge'] ?? 0) * 3600;
    $maxAge = (int)($settings['maxAge'] ?? 0) * 3600;
    $this->tstamp_minAge = MathUtility::forceIntegerInRange($minAge, 0);
    $this->tstamp_maxAge = MathUtility::forceIntegerInRange($maxAge, 0);
    $this->maxExternalFiles = MathUtility::forceIntegerInRange((int)($settings['maxExternalFiles'] ?? 5), 0, 1000);
    $this->flagBitMask = MathUtility::forceIntegerInRange((int)($settings['flagBitMask'] ?? 0), 0, 255);

    // Initialize lexer (class that deconstructs the text into words):
    // a configured class name wins, otherwise the default Lexer is used.
    $lexerClassName = ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'] ?? null) ?: Lexer::class;
    $this->lexerObj = GeneralUtility::makeInstance($lexerClassName);
    $this->lexerObj->debug = (bool)($settings['debugMode'] ?? false);
}
133 
/**
 * Prepares the indexer for one indexing run.
 *
 * @param array|null $configuration Indexing configuration. When an array is
 *                                  given it replaces $this->conf; null keeps
 *                                  the configuration that is already set.
 */
public function init(?array $configuration = null): void
{
    if ($configuration !== null) {
        $this->conf = $configuration;
    }
    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();
    // Initialize external document parsers:
    // Example configuration, see ext_localconf.php of this file!
    // !empty() keeps the truthiness check but tolerates a configuration without this key.
    if (!empty($this->conf['index_externals'])) {
        $this->initializeExternalParsers();
    }
}
150 
/**
 * Instantiates every parser class registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers']
 * and keeps those whose initParser() call succeeds.
 */
public function initializeExternalParsers(): void
{
    $registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] ?? [];
    foreach ($registeredParsers as $extension => $className) {
        $parser = GeneralUtility::makeInstance($className);
        // Register first, then hand the parser a back-reference to this indexer.
        $this->external_parsers[$extension] = $parser;
        $parser->pObj = $this;
        // A parser that reports FALSE from initParser() is dropped again.
        if (!$parser->initParser($extension)) {
            unset($this->external_parsers[$extension]);
        }
    }
}
162 
163  /********************************
164  *
165  * Indexing; TYPO3 pages (HTML content)
166  *
167  *******************************/
/**
 * Indexes the TYPO3 page content held in $this->conf['content'] under $this->hash['phash'].
 *
 * Indexing is attempted when getIndexStatus() reports that re-indexing is required, when
 * is_grlist_set() is false for the phash, or when $this->forceIndexing is set. The content is
 * split with splitHTMLContent() and hashed; if checkContentHash() does not return an array
 * (or re-indexing is required) the words are extracted, analysed and submitted, otherwise only
 * timestamp, set id, gr_list and rootline are updated. Every outcome is logged through
 * log_setTSlogMessage().
 *
 * NOTE(review): the embedded line numbering jumps from 220 to 222 and the braces below do not
 * balance as shown (the "}" on embedded line 225 has no visible opener), so one line is
 * apparently missing from this view — confirm against the upstream file before editing.
 */
171  public function indexTypo3PageContent(): void
172  {
173  $indexStatus = $this->getIndexStatus($this->conf['mtime'], $this->hash['phash']);
174  $reindexingRequired = $indexStatus->reindexRequired();
175  $is_grlist = $this->is_grlist_set($this->hash['phash']);
176  if ($reindexingRequired || !$is_grlist || $this->forceIndexing) {
177  // Setting message:
178  if ($this->forceIndexing) {
179  $this->log_setTSlogMessage('Indexing needed, reason: Forced', LogLevel::NOTICE);
180  } elseif ($reindexingRequired) {
181  $this->log_setTSlogMessage('Indexing needed, reason: ' . $indexStatus->reason(), LogLevel::NOTICE);
182  } else {
183  $this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!', LogLevel::NOTICE);
184  }
185  // Divide into title,keywords,description and body:
186  $this->timeTracker->push('Split content');
187  $this->indexingDataStringDto = $this->splitHTMLContent($this->conf['content']);
// A configured document title overrides the title extracted from the HTML.
188  if ($this->conf['indexedDocTitle']) {
189  $this->indexingDataStringDto->title = $this->conf['indexedDocTitle'];
190  }
191  $this->timeTracker->pull();
192  // Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so don't!)
193  $this->content_md5h = md5(implode('', $this->indexingDataStringDto->toArray()));
194  // This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
195  // If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more.
196  // This will also prevent pages from being indexed if a fe_users has logged in, and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem.
197  $checkCHash = $this->checkContentHash();
198  if (!is_array($checkCHash) || $reindexingRequired) {
// Start of the measured parse time; used for updateParsetime() further down.
199  $Pstart = $this->milliseconds();
200  $this->timeTracker->push('Converting entities of content');
201  $this->charsetEntity2utf8($this->indexingDataStringDto);
202  $this->timeTracker->pull();
203 
204  // Splitting words
205  $this->timeTracker->push('Extract words from content');
206  $splitInWords = $this->processWordsInArrays($this->indexingDataStringDto);
207  $this->timeTracker->pull();
208 
209  // Analyze the indexed words.
210  $this->timeTracker->push('Analyze the extracted words');
211  $indexArr = $this->indexAnalyze($splitInWords);
212  $this->timeTracker->pull();
213 
214  // Submitting page (phash) record
215  $this->timeTracker->push('Submitting page');
216  $this->submitPage();
217  $this->timeTracker->pull();
218 
219  // Check words and submit to word list if not there
220  $this->timeTracker->push('Check word list and submit words');
// NOTE(review): embedded line 221 is not part of this view; it presumably opens the block closed on line 225.
222  $indexArr = $this->removePhashCollisions($indexArr);
223  $this->checkWordList($indexArr);
224  $this->submitWords($indexArr, $this->hash['phash']);
225  }
226  $this->timeTracker->pull();
227 
228  // Set parse time
229  $this->updateParsetime($this->hash['phash'], $this->milliseconds() - $Pstart);
230 
231  // Checking external files if configured for.
232  if ($this->conf['index_externals']) {
233  $this->timeTracker->push('Checking external files', '');
234  $this->extractLinks($this->conf['content']);
235  $this->timeTracker->pull();
236  }
237  } else {
238  // Update the timestamp
239  $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
240  $this->updateSetId($this->hash['phash']);
241 
242  // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
243  $this->update_grlist($checkCHash['phash'], $this->hash['phash']);
244  $this->updateRootline();
245  $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
246  }
247  } else {
248  $this->log_setTSlogMessage('Indexing not needed, reason: ' . $indexStatus->reason());
249  }
250  }
251 
/**
 * Splits an HTML document into title, keywords, description and body text.
 *
 * The body is everything from the first "<body" on, reduced to the
 * TYPO3SEARCH_begin/end sections (if present), stripped of the sections
 * listed in $this->excludeSections and of all tags. Title and, when
 * $this->conf['index_metatags'] is set, the keywords/description meta tags
 * are read from the part in front of the body.
 *
 * @param string $content Complete HTML document
 * @return IndexingDataAsString DTO with the fields title, keywords, description and body
 */
public function splitHTMLContent(string $content): IndexingDataAsString
{
    $indexingDataDto = IndexingDataAsString::fromArray($this->defaultIndexingDataPayload);
    $indexingDataDto->body = stristr($content, '<body') ?: '';
    // Head = everything in front of the body. The length is computed explicitly because
    // substr($content, 0, -0) returns '' and would drop the whole head when there is no <body> tag.
    $headPart = substr($content, 0, strlen($content) - strlen($indexingDataDto->body));
    // get title
    $this->embracingTags($headPart, 'TITLE', $indexingDataDto->title, $dummy2, $dummy);
    // Only the part after the first colon is kept when the title contains one.
    $titleParts = explode(':', $indexingDataDto->title, 2);
    $indexingDataDto->title = trim($titleParts[1] ?? $titleParts[0]);
    // get keywords and description meta tags
    if (!empty($this->conf['index_metatags'])) {
        $meta = [];
        $i = 0;
        // Each successful call consumes one <meta> tag from $headPart and stores its attribute string.
        while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $meta[$i])) {
            $i++;
        }
        // @todo The code below stops at first unset tag. Is that correct?
        for ($i = 0; isset($meta[$i]); $i++) {
            // decode HTML entities, meta tag content needs to be encoded later
            $meta[$i] = GeneralUtility::get_tag_attributes($meta[$i], true);
            // A meta tag may lack the content attribute; treat that as an empty value.
            $metaContent = (string)($meta[$i]['content'] ?? '');
            if (stripos(($meta[$i]['name'] ?? ''), 'keywords') !== false) {
                $indexingDataDto->keywords .= ',' . $this->addSpacesToKeywordList($metaContent);
            }
            if (stripos(($meta[$i]['name'] ?? ''), 'description') !== false) {
                $indexingDataDto->description .= ',' . $metaContent;
            }
        }
    }
    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($indexingDataDto->body);
    // Get rid of unwanted sections (i.e. scripting and style stuff) in body
    $tagList = explode(',', $this->excludeSections);
    foreach ($tagList as $tag) {
        // Repeat until no further section of this tag is found; each call removes one.
        while ($this->embracingTags($indexingDataDto->body, $tag, $dummy, $indexingDataDto->body, $dummy2)) {
        }
    }
    // remove tags, but first make sure we don't concatenate words by doing it
    $indexingDataDto->body = str_replace('<', ' <', $indexingDataDto->body);
    $indexingDataDto->body = trim(strip_tags($indexingDataDto->body));
    $indexingDataDto->keywords = trim($indexingDataDto->keywords);
    $indexingDataDto->description = trim($indexingDataDto->description);

    return $indexingDataDto;
}
301 
/**
 * Reads the charset announced by a <meta http-equiv="Content-Type" ...> tag.
 *
 * @param string $content HTML document
 * @return string The charset label as written in the document, or '' when none is found
 */
public function getHTMLcharset(string $content): string
{
    // @todo: Use \DOMDocument and DOMXpath
    $metaTagPattern = '/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i';
    if (!preg_match($metaTagPattern, $content, $metaTag)) {
        return '';
    }
    // Look for "charset=..." inside the matched meta tag only.
    if (!preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i', $metaTag[0], $charsetMatch)) {
        return '';
    }

    return $charsetMatch[1];
}
316 
/**
 * Converts an HTML document to UTF-8 and decodes its HTML entities.
 *
 * @param string $content HTML document
 * @param string $charset Source charset; when empty it is taken from the
 *                        document's Content-Type meta tag (getHTMLcharset())
 * @return string Entity-decoded content, converted to UTF-8 when the charset is known
 */
public function convertHTMLToUtf8(string $content, string $charset = ''): string
{
    // Find charset
    $charset = strtolower(trim($charset ?: $this->getHTMLcharset($content)));
    // Convert charset
    if ($charset !== '' && $charset !== 'utf-8') {
        try {
            $converted = mb_convert_encoding($content, 'utf-8', $charset);
            if (is_string($converted)) {
                $content = $converted;
            }
        } catch (\ValueError) {
            // The charset label comes from the document itself and may be unknown to mbstring.
            // Keep the content unconverted instead of aborting the indexing run.
        }
    }
    // Convert entities, assuming document is now UTF-8
    return html_entity_decode($content);
}
332 
/**
 * Finds the first occurrence of a tag (case-insensitive) and cuts it out of the string.
 *
 * @param string $string Text to search
 * @param string $tagName Tag name without brackets, e.g. "title"
 * @param string|null $tagContent Receives the text between start and end tag ('' when there is no end tag)
 * @param string|null $stringAfter Receives $string with the tag removed; without an end tag it receives
 *                                 only the text following the start tag
 * @param string|null $paramList Receives the raw attribute string of the start tag
 * @return bool TRUE when the start tag was found (the by-reference arguments are then set), FALSE otherwise
 */
public function embracingTags(string $string, string $tagName, ?string &$tagContent, ?string &$stringAfter, ?string &$paramList): bool
{
    $endTag = '</' . $tagName . '>';
    $startTag = '<' . $tagName;
    // stristr used because we want a case-insensitive search for the tag.
    $fromStartTag = stristr($string, $startTag);
    // if the tag was not found, return FALSE
    if ($fromStartTag === false) {
        return false;
    }
    // Split "<attributes>" from the rest. array_pad() covers a truncated start tag without ">",
    // where explode() yields a single element and the rest would otherwise be undefined.
    [$paramList, $afterStartTag] = array_pad(explode('>', substr($fromStartTag, strlen($startTag)), 2), 2, '');
    $fromEndTag = stristr($afterStartTag, $endTag);
    if ($fromEndTag !== false) {
        $stringBefore = substr($string, 0, (int)stripos($string, $startTag));
        $tagContent = substr($afterStartTag, 0, strlen($afterStartTag) - strlen($fromEndTag));
        $stringAfter = $stringBefore . substr($fromEndTag, strlen($endTag));
    } else {
        // No end tag: report empty content and continue behind the start tag.
        $tagContent = '';
        $stringAfter = $afterStartTag;
    }
    return true;
}
366 
/**
 * Reduces $body to the parts selected by <!--TYPO3SEARCH_begin--> / <!--TYPO3SEARCH_end--> markers.
 *
 * Text following a "begin" marker is kept. An "end" marker keeps the segment
 * that preceded it only when no "begin" marker has been seen since.
 *
 * @param string $body HTML body, modified in place when markers are present
 * @return bool TRUE when at least one marker was found, otherwise FALSE ($body untouched)
 */
public function typoSearchTags(string &$body): bool
{
    $segments = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body) ?: [];
    if (count($segments) <= 1) {
        return false;
    }

    $body = '';
    $pendingSegment = '';
    foreach ($segments as $segment) {
        // A segment following a marker looks like "begin-->text" or "end-->text".
        $markerAndText = explode('-->', $segment, 2);
        $marker = trim($markerAndText[0]);
        if ($marker === 'begin') {
            $body .= $markerAndText[1];
            $pendingSegment = '';
            continue;
        }
        if ($marker === 'end') {
            $body .= $pendingSegment;
            continue;
        }
        // Not a marker segment: remember it in case an "end" marker follows.
        $pendingSegment = $segment;
    }

    return true;
}
395 
/**
 * Extracts all hyperlinks from the HTML and indexes the documents they point to.
 *
 * Links with a URL scheme are passed to indexExternalUrl() when the extension
 * setting "indexExternalURLs" is enabled; links without scheme and without
 * query string are resolved to local files and passed to indexRegularDocument().
 *
 * @param string $content HTML content to scan for <a> tags
 */
public function extractLinks(string $content): void
{
    // Get links and traverse them:
    foreach ($this->extractHyperLinks($content) as $linkInfo) {
        // Decode entities:
        // localPath means: This file is sent by a download script. While the indexed URL has to point
        // to $linkInfo['href'], the absolute path to the file is specified here!
        $linkSource = htmlspecialchars_decode($linkInfo['localPath'] ?: $linkInfo['href']);
        // Parse URL:
        $qParts = parse_url($linkSource);
        // Check for jumpurl (TYPO3 specific thing...)
        if (($qParts['query'] ?? false) && str_contains($qParts['query'] ?? '', 'jumpurl=')) {
            parse_str($qParts['query'], $getP);
            // "jumpurl=" may only be the tail of another parameter name, or an array parameter;
            // follow it only when a plain string value is really present.
            if (is_string($getP['jumpurl'] ?? null)) {
                $linkSource = $getP['jumpurl'];
                $qParts = parse_url($linkSource);
            }
        }
        if (!$linkInfo['localPath'] && ($qParts['scheme'] ?? false)) {
            if (!empty($this->indexerConfig['indexExternalURLs'])) {
                // Index external URL (http or otherwise)
                $this->indexExternalUrl($linkSource);
            }
        } elseif (!($qParts['query'] ?? false)) {
            $linkSource = urldecode($linkSource);
            if (GeneralUtility::isAllowedAbsPath($linkSource)) {
                $localFile = $linkSource;
            } else {
                $localFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $linkSource);
            }
            if ($localFile && @is_file($localFile)) {
                // Index local file:
                if ($linkInfo['localPath']) {
                    // pathinfo() omits the "extension" key for files without one.
                    $ext = strtolower(pathinfo($linkSource)['extension'] ?? '');
                    $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
                } else {
                    $this->indexRegularDocument($linkSource);
                }
            }
        }
    }
}
445 
/**
 * Collects all <a> tags of the HTML that carry a non-empty href which is not a pure fragment link.
 *
 * @param string $html HTML content
 * @return array List of arrays with the keys "tag" (raw tag), "href" and "localPath"
 *               (result of createLocalPath() for the URL-decoded href)
 */
public function extractHyperLinks(string $html): array
{
    $htmlParser = GeneralUtility::makeInstance(HtmlParser::class);
    $hyperLinksData = [];
    foreach ($htmlParser->splitTags('a', $html) as $index => $tagData) {
        // Only the odd entries of the split result are inspected as tags.
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'a') {
            continue;
        }
        if (empty($tagAttributes[0]['href'])) {
            continue;
        }
        $href = $tagAttributes[0]['href'];
        if (str_starts_with($href, '#')) {
            continue;
        }
        $hyperLinksData[] = [
            'tag' => $tagData,
            'href' => $href,
            'localPath' => $this->createLocalPath(urldecode($href)),
        ];
    }
    return $hyperLinksData;
}
473 
/**
 * Returns the href of the first <base> tag that has a non-empty href attribute.
 *
 * @param string $html HTML content
 * @return string The href value, or '' when no <base> tag provides one
 */
public function extractBaseHref(string $html): string
{
    $href = '';
    $htmlParser = GeneralUtility::makeInstance(HtmlParser::class);
    foreach ($htmlParser->splitTags('base', $html) as $index => $tagData) {
        // Only the odd entries of the split result are inspected as tags.
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'base') {
            continue;
        }
        // A <base> tag may carry only a target attribute; a missing href must not turn
        // the result into null, which would violate the string return type.
        $href = (string)($tagAttributes[0]['href'] ?? '');
        if ($href) {
            break;
        }
    }
    return $href;
}
496 
497  /******************************************
498  *
499  * Indexing; external URL
500  *
501  ******************************************/
/**
 * Fetches an external URL and indexes it as an HTML document.
 *
 * The URL is only fetched when a HEAD request reports a Content-Type
 * containing "text/html". The content is written to a temporary file that
 * is handed to indexRegularDocument() and removed afterwards.
 *
 * @param string $externalUrl URL to index
 */
public function indexExternalUrl(string $externalUrl): void
{
    // Get headers:
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    if (!is_array($urlHeaders)) {
        return;
    }
    // Header names are case-insensitive and the header may be absent altogether.
    $contentType = array_change_key_case($urlHeaders, CASE_LOWER)['content-type'] ?? '';
    if (stripos($contentType, 'text/html') === false) {
        return;
    }
    // Cast before assigning: the property is typed string and the fetch result is not guaranteed to be one.
    $content = (string)GeneralUtility::getUrl($externalUrl);
    $this->indexExternalUrl_content = $content;
    if ($content === '') {
        return;
    }
    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    GeneralUtility::writeFile($tmpFile, $content);
    try {
        // Index that file:
        // Using "TRUE" for second parameter to force indexing of external URLs (mtime doesn't make sense, does it?)
        $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
    } finally {
        // Remove the temporary file even when indexing throws.
        unlink($tmpFile);
    }
}
524 
/**
 * Performs a HEAD request and returns the response headers.
 *
 * @param string $url URL to request
 * @return array|false Header name => header value (multiple values of one header are
 *                     concatenated without separator), or FALSE when the request failed
 */
public function getUrlHeaders(string $url): array|false
{
    try {
        $response = GeneralUtility::makeInstance(RequestFactory::class)->request($url, 'HEAD');
        return array_map(
            static fn ($headerValues): string => implode('', $headerValues),
            $response->getHeaders()
        );
    } catch (\Exception $e) {
        // fail silently if the HTTP request failed
        return false;
    }
}
546 
/**
 * Resolves a link target to a local file path by trying the available strategies in order:
 * absRefPrefix, site URL, absolute URL and finally relative URL.
 *
 * @param string $sourcePath Link target (URL-decoded href)
 * @return string Local path of the first strategy that yields one, otherwise ''
 */
protected function createLocalPath(string $sourcePath): string
{
    $orderedResolvers = [
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
    ];
    foreach ($orderedResolvers as $resolver) {
        $localPath = $this->{$resolver}($sourcePath);
        if ($localPath !== '') {
            return $localPath;
        }
    }
    // Last resort; its result is returned as is, even when empty.
    return $this->createLocalPathFromRelativeURL($sourcePath);
}
568 
/**
 * Maps a URL that starts with the site URL of the current request to a file below the public path.
 *
 * @param string $sourcePath Link target
 * @return string Local path, or '' when the URL does not start with the site URL or the file is not allowed
 */
protected function createLocalPathUsingDomainURL(string $sourcePath): string
{
    $siteUrl = $GLOBALS['TYPO3_REQUEST']->getAttribute('normalizedParams')->getSiteUrl();
    if (!str_starts_with($sourcePath, $siteUrl)) {
        return '';
    }
    $candidate = Environment::getPublicPath() . '/' . substr($sourcePath, strlen($siteUrl));

    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
586 
/**
 * Maps a URL that starts with the configured TypoScript "absRefPrefix" to a file below the public path.
 *
 * @param string $sourcePath Link target
 * @return string Local path, or '' when no prefix is configured, the URL does not
 *                start with it, or the file is not allowed
 */
protected function createLocalPathUsingAbsRefPrefix(string $sourcePath): string
{
    $typoScriptConfig = $GLOBALS['TYPO3_REQUEST']->getAttribute('frontend.typoscript')?->getConfigArray();
    if (!$typoScriptConfig) {
        return '';
    }
    $absRefPrefix = $typoScriptConfig['absRefPrefix'] ?? '';
    $prefixLength = strlen($absRefPrefix);
    if ($prefixLength === 0 || !str_starts_with($sourcePath, $absRefPrefix)) {
        return '';
    }
    $candidate = Environment::getPublicPath() . '/' . substr($sourcePath, $prefixLength);

    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
609 
/**
 * Maps a URL that starts with "/" to a file below the public path.
 *
 * @param string $sourcePath Link target
 * @return string Local path, or '' when the URL does not start with "/" or the file is not allowed
 */
protected function createLocalPathFromAbsoluteURL(string $sourcePath): string
{
    if (!str_starts_with($sourcePath, '/')) {
        return '';
    }
    // Drop the leading slash; it is re-added as separator behind the public path.
    $candidate = Environment::getPublicPath() . '/' . substr($sourcePath, 1);

    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
625 
/**
 * Maps a relative URL (see isRelativeURL()) to a file below the public path.
 *
 * @param string $sourcePath Link target
 * @return string Local path, or '' when the URL is not relative or the file is not allowed
 */
protected function createLocalPathFromRelativeURL(string $sourcePath): string
{
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }
    $candidate = Environment::getPublicPath() . '/' . $sourcePath;

    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
640 
/**
 * Tells whether a URL is relative: it has no (or an empty) scheme and its path does not start with "/".
 *
 * @param string $url URL to inspect
 * @return bool TRUE for relative URLs
 */
protected static function isRelativeURL(string $url): bool
{
    $urlParts = @parse_url($url);
    if (($urlParts['scheme'] ?? '') !== '') {
        return false;
    }
    $firstPathCharacter = $urlParts['path'][0] ?? '';

    return !str_starts_with($firstPathCharacter, '/');
}
649 
/**
 * Tells whether a path points to an existing file inside the public web directory.
 *
 * @param string $filePath Path to check; "../" segments are resolved first
 * @return bool TRUE when the resolved path lies below the public path and is a regular file
 */
protected static function isAllowedLocalFile(string $filePath): bool
{
    $filePath = GeneralUtility::resolveBackPath($filePath);
    // Compare against the public path including the directory separator. A bare prefix match
    // would also accept sibling directories such as "<public>_backup/..." reached via "../".
    $publicPathPrefix = rtrim(Environment::getPublicPath(), '/') . '/';

    return str_starts_with($filePath, $publicPathPrefix) && is_file($filePath);
}
660 
661  /******************************************
662  *
663  * Indexing; external files (PDF, DOC, etc)
664  *
665  ******************************************/
/**
 * Indexes a regular document (external file) with the external parser registered for its extension.
 *
 * For every content part reported by fileContentParts() the method computes the hashes
 * (setExtHashes()), asks getIndexStatus() whether re-indexing is required and, if so or if
 * $force is set, reads the part, analyses its words and submits the result. A section record
 * is written through submitFile_section() for every part, indexed or not.
 *
 * @param string $file Path of the file as it should be recorded; relative paths are resolved against the public path
 * @param bool $force When true, indexing happens regardless of index status, file limit and content hash
 * @param string $contentTmpFile When set, this file is read instead of $file
 * @param string $altExtension When set, this extension selects the parser instead of the one derived from $file
 *
 * NOTE(review): the embedded line numbering jumps from 742 to 744 and the braces below do not
 * balance as shown (the "}" on embedded line 747 has no visible opener), so one line is
 * apparently missing from this view — confirm against the upstream file before editing.
 */
674  public function indexRegularDocument(string $file, bool $force = false, string $contentTmpFile = '', string $altExtension = ''): void
675  {
676  $fI = pathinfo($file);
// NOTE(review): pathinfo() omits the "extension" key for files without one; with an empty
// $altExtension this reads an undefined key and passes null to strtolower() — verify callers.
677  $ext = $altExtension ?: strtolower($fI['extension']);
678  // Create abs-path
679  if (!$contentTmpFile) {
680  if (!‪PathUtility::isAbsolutePath($file)) {
681  // Relative, prepend public web path:
682  $absFile = GeneralUtility::getFileAbsFileName(‪Environment::getPublicPath() . '/' . $file);
683  } else {
684  // Absolute, pass-through:
685  $absFile = $file;
686  }
687  $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
688  } else {
689  $absFile = $contentTmpFile;
690  }
691  // Indexing the document:
692  if ($absFile && @is_file($absFile)) {
693  if ($this->external_parsers[$ext] ?? false) {
694  $fileInfo = stat($absFile);
695  $cParts = $this->fileContentParts($ext, $absFile);
696  foreach ($cParts as $cPKey) {
697  $this->internal_log = [];
698  $this->timeTracker->push('Index: ' . str_replace('.', '_', ‪PathUtility::basename($file)) . ($cPKey ? '#' . $cPKey : ''));
699  $Pstart = $this->milliseconds();
700  $subinfo = ['key' => $cPKey];
701  // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
702  $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
703  $indexStatus = $this->getIndexStatus($fileInfo['mtime'], $phash_arr['phash']);
704  $reindexingRequired = $indexStatus->reindexRequired();
705  if ($reindexingRequired || $force) {
706  if ($reindexingRequired) {
707  $this->log_setTSlogMessage('Indexing needed, reason: ' . $indexStatus->reason(), LogLevel::NOTICE);
708  } else {
709  $this->log_setTSlogMessage('Indexing forced by flag', LogLevel::NOTICE);
710  }
711  // Check external file counter:
712  if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
713  // Divide into title,keywords,description and body:
714  $this->timeTracker->push('Split content');
715  $indexingDataDtoAsString = $this->readFileContent($ext, $absFile, $cPKey);
716  $this->timeTracker->pull();
717  if ($indexingDataDtoAsString !== null) {
718  // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
719  $content_md5h = md5(implode('', $indexingDataDtoAsString->toArray()));
720  if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
721  // Increment counter:
722  $this->externalFileCounter++;
723 
724  // Splitting words
725  $this->timeTracker->push('Extract words from content');
726  $splitInWords = $this->processWordsInArrays($indexingDataDtoAsString);
727  $this->timeTracker->pull();
728 
729  // Analyze the indexed words.
730  $this->timeTracker->push('Analyze the extracted words');
731  $indexArr = $this->indexAnalyze($splitInWords);
732  $this->timeTracker->pull();
733 
734  // Submitting page (phash) record
735  $this->timeTracker->push('Submitting page');
736 
737  // Unfortunately the original creation time cannot be determined, therefore we fall back to the modification date
738  $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $indexingDataDtoAsString);
739  $this->timeTracker->pull();
740 
741  // Check words and submit to word list if not there
742  $this->timeTracker->push('Check word list and submit words');
// NOTE(review): embedded line 743 is not part of this view; it presumably opens the block closed on line 747.
744  $indexArr = $this->removePhashCollisions($indexArr);
745  $this->checkWordList($indexArr);
746  $this->submitWords($indexArr, $phash_arr['phash']);
747  }
748  $this->timeTracker->pull();
749 
750  // Set parsetime
751  $this->updateParsetime($phash_arr['phash'], $this->milliseconds() - $Pstart);
752  } else {
753  // Update the timestamp
754  $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
755  $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
756  }
757  } else {
758  $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
759  }
760  } else {
761  $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
762  }
763  } else {
764  $this->log_setTSlogMessage('Indexing not needed, reason: ' . $indexStatus->reason());
765  }
766  // Checking and setting sections:
767  $this->submitFile_section($phash_arr['phash']);
768  // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
769  $this->timeTracker->pull();
770  }
771  } else {
772  $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
773  }
774  } else {
775  $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
776  }
777  }
778 
/**
 * Reads the content of one part of an external file through the parser registered for its extension.
 *
 * @param string $fileExtension File extension selecting the parser in $this->external_parsers
 * @param string $absoluteFileName Absolute path of the file to read
 * @param string|int $sectionPointer Part of the file to read (one of the values from fileContentParts())
 * @return IndexingDataAsString|null The parsed content, or null when no parser is registered
 *                                   for the extension or the parser returned nothing usable
 */
public function readFileContent(string $fileExtension, string $absoluteFileName, string|int $sectionPointer): ?IndexingDataAsString
{
    // Consult relevant external document parser; the extension may have no parser registered.
    $parser = $this->external_parsers[$fileExtension] ?? null;
    $indexingDataDto = is_object($parser)
        ? $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
        : null;

    // Parsers returning a plain array are still supported, but deprecated.
    if (is_array($indexingDataDto)) {
        trigger_error(
            sprintf(
                'The method %s returns an array, which is deprecated and will stop working in TYPO3 v14.0. Return an instance of %s instead.',
                get_class($parser) . '::readFileContent()',
                IndexingDataAsString::class
            ),
            E_USER_DEPRECATED
        );
        $indexingDataDto = IndexingDataAsString::fromArray($indexingDataDto);
    }

    return $indexingDataDto instanceof IndexingDataAsString ? $indexingDataDto : null;
}
813 
/**
 * Asks the parser registered for the extension into which parts a file is divided.
 *
 * @param string $ext File extension selecting the parser in $this->external_parsers
 * @param string $absFile Absolute path of the file
 * @return array Part keys as returned by the parser; [0] when no parser is registered for the extension
 */
public function fileContentParts(string $ext, string $absFile): array
{
    // Consult relevant external document parser; the extension may have no parser registered.
    $parser = $this->external_parsers[$ext] ?? null;
    if (is_object($parser)) {
        return $parser->fileContentParts($ext, $absFile);
    }
    return [0];
}
830 
/**
 * Wraps plain content into an indexing DTO: the content becomes the body,
 * all other fields keep the values of $this->defaultIndexingDataPayload.
 *
 * @param string $content Plain text content
 * @return IndexingDataAsString
 */
public function splitRegularContent(string $content): IndexingDataAsString
{
    $plainContentDto = IndexingDataAsString::fromArray($this->defaultIndexingDataPayload);
    $plainContentDto->body = $content;

    return $plainContentDto;
}
841 
842  /**********************************
843  *
844  * Analysing content, Extracting words
845  *
846  **********************************/
/**
 * Decodes the HTML entities of every non-empty field of the DTO in place.
 *
 * @param IndexingDataAsString $indexingDataDto DTO whose fields are rewritten
 */
public function charsetEntity2utf8(IndexingDataAsString $indexingDataDto): void
{
    foreach ($indexingDataDto->toArray() as $field => $text) {
        if ((string)$text === '') {
            continue;
        }
        // decode all numeric / html-entities in the string to real characters:
        $indexingDataDto->{$field} = html_entity_decode($text);
    }
}
860 
/**
 * Splits every field of the DTO into words using the lexer.
 *
 * @param IndexingDataAsString $input Field texts
 * @return IndexingDataAsArray Word lists per field; title, keywords and description without duplicates
 */
public function processWordsInArrays(IndexingDataAsString $input): IndexingDataAsArray
{
    // split all parts to words (array_map() keeps the field names as keys)
    $wordsPerField = array_map(
        fn ($text) => $this->lexerObj->split2Words($text),
        $input->toArray()
    );
    $indexingDataDto = IndexingDataAsArray::fromArray($wordsPerField);

    // For title, keywords, and description we don't want duplicates
    foreach (['title', 'keywords', 'description'] as $field) {
        $indexingDataDto->{$field} = array_unique($indexingDataDto->{$field});
    }

    return $indexingDataDto;
}
882 
/**
 * Builds the short description stored with an indexed item from the body text.
 *
 * The maximum length is taken from $this->conf['index_descrLgd'] (range 0..255,
 * default 200); a length of 0 disables the description.
 *
 * @param IndexingDataAsString $indexingDataDto DTO whose body is used
 * @return string Body with whitespace runs collapsed to single spaces, cut to the maximum length in bytes
 */
public function bodyDescription(IndexingDataAsString $indexingDataDto): string
{
    // Setting description
    $maxL = MathUtility::forceIntegerInRange($this->conf['index_descrLgd'] ?? null, 0, 255, 200);
    if (!$maxL) {
        return '';
    }
    $body = $indexingDataDto->body;
    // With the "u" modifier preg_replace() returns null for invalid UTF-8 input;
    // fall back to a byte-wise replacement instead of passing null on.
    $bodyDescription = preg_replace('/\s+/u', ' ', $body) ?? preg_replace('/\s+/', ' ', $body) ?? $body;
    // Shorten the string. If the database has the wrong character set,
    // the string is probably truncated again.
    return \mb_strcut($bodyDescription, 0, $maxL, 'utf-8');
}
899 
/**
 * Builds the word index for one document.
 *
 * Title, keywords and description words are registered with the bit offsets
 * 7, 6 and 5 (see analyzeHeaderinfo()), afterwards the body words are added.
 *
 * @param IndexingDataAsArray $indexingDataDto Word lists per field
 * @return array Index array keyed by word
 */
public function indexAnalyze(IndexingDataAsArray $indexingDataDto): array
{
    $indexArr = [];
    $headerFieldOffsets = ['title' => 7, 'keywords' => 6, 'description' => 5];
    foreach ($headerFieldOffsets as $field => $offset) {
        $this->analyzeHeaderinfo($indexArr, $indexingDataDto->{$field}, $offset);
    }
    $this->analyzeBody($indexArr, $indexingDataDto);

    return $indexArr;
}
914 
/**
 * Registers header words (title, keywords or description) in the index array.
 *
 * @param array $retArr Index array keyed by word; each entry gets "hash", "cmp" and "count"
 * @param array $content List of words
 * @param int $offset Bit position that is set in the entry's "cmp" value
 */
public function analyzeHeaderinfo(array &$retArr, array $content, int $offset): void
{
    // Priority used for flagBitMask feature (see extension configuration)
    $priorityBit = 2 ** $offset;
    foreach ($content as $word) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $baseWord = mb_substr($word, 0, 60);
        if (!isset($retArr[$baseWord])) {
            // Word ID (wid)
            $retArr[$baseWord]['hash'] = md5($baseWord);
        }
        $retArr[$baseWord]['cmp'] = ($retArr[$baseWord]['cmp'] ?? 0) | $priorityBit;
        // Start counting at 0 when there is no usable count yet, then increase the number of occurrences.
        $retArr[$baseWord]['count'] = ($retArr[$baseWord]['count'] ?? false) ?: 0;
        $retArr[$baseWord]['count']++;
        $this->wordcount++;
    }
}
942 
/**
 * Registers the body words in the index array.
 *
 * @param array $retArr Index array keyed by word; new entries get "first" (key of the first
 *                      occurrence in the body word list) and "hash", every entry gets "count"
 * @param IndexingDataAsArray $indexingDataDto DTO whose body word list is analysed
 */
public function analyzeBody(array &$retArr, IndexingDataAsArray $indexingDataDto): void
{
    foreach ($indexingDataDto->body as $key => $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        // mb_substr() as in analyzeHeaderinfo(): a byte-wise cut could split a multibyte character
        // and would give the same long word a different key/hash than in the header fields.
        $val = mb_substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // First occurrence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = md5($val);
        }
        if (!($retArr[$val]['count'] ?? false)) {
            $retArr[$val]['count'] = 0;
        }

        // Increase number of occurrences
        $retArr[$val]['count']++;
        $this->wordcount++;
    }
}
968 
969  /********************************
970  *
971  * SQL; TYPO3 Pages
972  *
973  *******************************/
/**
 * Stores the index data of the current TYPO3 page.
 *
 * Existing rows for $this->hash['phash'] are removed first. Afterwards one
 * row each is written to index_phash, index_section, index_grlist and
 * index_fulltext, plus index_debug when "debugMode" is enabled in the
 * extension configuration.
 *
 * @throws \JsonException if the debug information cannot be encoded
 */
public function submitPage(): void
{
    $phash = $this->hash['phash'];
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Remove any current data for this phash:
    $this->removeOldIndexedPages($phash);

    // Setting new phash_row
    $fields = [
        'phash' => $phash,
        'phash_grouping' => $this->hash['phash_grouping'],
        'static_page_arguments' => is_array($this->conf['staticPageArguments']) ? json_encode($this->conf['staticPageArguments']) : null,
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // 0 = TYPO3 page
        'item_type' => 0,
        'item_title' => $this->indexingDataStringDto->title,
        'item_description' => $this->bodyDescription($this->indexingDataStringDto),
        'item_mtime' => (int)$this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        // Creation date of page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
    ];
    $connectionPool->getConnectionForTable('index_phash')->insert('index_phash', $fields);

    // PROCESSING index_section
    $this->submit_section($phash, $phash);
    // PROCESSING index_grlist
    $this->submit_grlist($phash, $phash);

    // PROCESSING index_fulltext
    $fields = [
        'phash' => $phash,
        'fulltextdata' => implode(' ', $this->indexingDataStringDto->toArray()),
    ];
    // The configured value may be a string; substr() requires an int length under strict_types.
    $fullTextDataLength = (int)($this->indexerConfig['fullTextDataLength'] ?? 0);
    if ($fullTextDataLength > 0) {
        $fields['fulltextdata'] = substr($fields['fulltextdata'], 0, $fullTextDataLength);
    }
    $connectionPool->getConnectionForTable('index_fulltext')->insert('index_fulltext', $fields);

    // PROCESSING index_debug
    if ($this->indexerConfig['debugMode'] ?? false) {
        $fields = [
            'phash' => $phash,
            // substr() cuts by bytes and may split a multi-byte character; substitute such
            // broken sequences instead of letting json_encode() throw on malformed UTF-8.
            'debuginfo' => json_encode([
                'external_parsers initialized' => array_keys($this->external_parsers),
                'conf' => array_merge($this->conf, ['content' => substr($this->conf['content'], 0, 1000)]),
                'contentParts' => array_merge($this->indexingDataStringDto->toArray(), ['body' => substr($this->indexingDataStringDto->body, 0, 1000)]),
                'logs' => $this->internal_log,
                'lexer' => $this->lexerObj->debugString,
            ], JSON_THROW_ON_ERROR | JSON_INVALID_UTF8_SUBSTITUTE),
        ];
        $connectionPool->getConnectionForTable('index_debug')->insert('index_debug', $fields);
    }
}
1046 
/**
 * Inserts a group list record into index_grlist.
 *
 * The row holds the two given hashes together with the current
 * $this->conf['gr_list'] and its md5 value.
 *
 * @param string $hash Value for the "phash" column
 * @param string $phash_x Value for the "phash_x" column
 */
public function submit_grlist(string $hash, string $phash_x): void
{
    $groupList = $this->conf['gr_list'];

    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->insert(
            'index_grlist',
            [
                'phash' => $hash,
                'phash_x' => $phash_x,
                'hash_gr_list' => md5($groupList),
                'gr_list' => $groupList,
            ]
        );
}
1066 
/**
 * Inserts a section record into index_section.
 *
 * Besides the two hashes and the current page id the row receives the
 * rootline columns added by getRootLineFields().
 *
 * @param string $hash Value for the "phash" column
 * @param string $hash_t3 Value for the "phash_t3" column
 */
public function submit_section(string $hash, string $hash_t3): void
{
    $sectionRow = [
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id'],
    ];
    // Adds rl0, rl1, rl2 and any additionally configured rootline columns
    $this->getRootLineFields($sectionRow);

    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_section')
        ->insert('index_section', $sectionRow);
}
1086 
/**
 * Deletes all index rows registered for a TYPO3 page hash.
 *
 * @param string $phash Page hash whose rows are removed
 */
public function removeOldIndexedPages(string $phash): void
{
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Each step is a pair of table name and delete criteria, executed in this order.
    $deletions = [
        // Old registrations in all tables. Because the pages are TYPO3 pages
        // there can be nothing else than 1-1 relations here.
        ['index_phash', ['phash' => $phash]],
        ['index_section', ['phash' => $phash]],
        ['index_grlist', ['phash' => $phash]],
        ['index_fulltext', ['phash' => $phash]],
        ['index_debug', ['phash' => $phash]],
        // All index_section records with phash_t3 set to this hash (this includes
        // records set for external media on the page as well). The original code
        // notes that these are re-inserted in indexRegularDocument($file).
        ['index_section', ['phash_t3' => $phash]],
    ];

    foreach ($deletions as [$table, $criteria]) {
        $connectionPool->getConnectionForTable($table)->delete($table, $criteria);
    }
}
1107 
1108  /********************************
1109  *
1110  * SQL; External media
1111  *
1112  *******************************/
/**
 * Stores the index data of an external file.
 *
 * Existing rows for $hash['phash'] are removed via removeOldIndexedFiles().
 * Afterwards one row each is written to index_phash and index_fulltext, plus
 * index_debug when "debugMode" is enabled in the extension configuration.
 *
 * @param array $hash Hashes of the file; keys "phash" and "phash_grouping" are read
 * @param string $file File reference; a value with a URL scheme is flagged as external URL
 * @param array $subinfo Stored JSON encoded in "static_page_arguments"
 * @param string $ext Extension used to look up the item type in the external parser
 * @param int $mtime Stored as item_mtime
 * @param int $ctime Stored as item_crdate
 * @param int $size Stored as item_size
 * @param string $content_md5h Stored as contentHash
 * @param IndexingDataAsString $indexingDataDto Title, description source and full text of the file
 * @throws \JsonException if the debug information cannot be encoded
 */
public function submitFilePage(array $hash, string $file, array $subinfo, string $ext, int $mtime, int $ctime, int $size, string $content_md5h, IndexingDataAsString $indexingDataDto): void
{
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Find item type; fall back to the plain extension when the parser maps nothing for it
    $storeItemType = ($this->external_parsers[$ext]->ext2itemtype_map[$ext] ?? '') ?: $ext;
    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($hash['phash']);
    // Split filename:
    $fileParts = parse_url($file);

    // Setting new
    $fields = [
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'static_page_arguments' => json_encode($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $storeItemType,
        'item_title' => trim($indexingDataDto->title) ?: PathUtility::basename($file),
        'item_description' => $this->bodyDescription($indexingDataDto),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        'externalUrl' => ($fileParts['scheme'] ?? false) ? 1 : 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid'],
    ];
    $connectionPool->getConnectionForTable('index_phash')->insert('index_phash', $fields);

    // PROCESSING index_fulltext
    $fields = [
        'phash' => $hash['phash'],
        'fulltextdata' => implode(' ', $indexingDataDto->toArray()),
    ];
    // The configured value may be a string; substr() requires an int length under strict_types.
    $fullTextDataLength = (int)($this->indexerConfig['fullTextDataLength'] ?? 0);
    if ($fullTextDataLength > 0) {
        $fields['fulltextdata'] = substr($fields['fulltextdata'], 0, $fullTextDataLength);
    }
    $connectionPool->getConnectionForTable('index_fulltext')->insert('index_fulltext', $fields);

    // PROCESSING index_debug
    if ($this->indexerConfig['debugMode'] ?? false) {
        $fields = [
            'phash' => $hash['phash'],
            // substr() cuts by bytes and may split a multi-byte character; substitute such
            // broken sequences instead of letting json_encode() throw on malformed UTF-8.
            'debuginfo' => json_encode([
                'static_page_arguments' => $subinfo,
                'contentParts' => array_merge($indexingDataDto->toArray(), ['body' => substr($indexingDataDto->body, 0, 1000)]),
                'logs' => $this->internal_log,
                'lexer' => $this->lexerObj->debugString,
            ], JSON_THROW_ON_ERROR | JSON_INVALID_UTF8_SUBSTITUTE),
        ];
        $connectionPool->getConnectionForTable('index_debug')->insert('index_debug', $fields);
    }
}
1189 
/**
 * Registers a file in index_section for the current page, unless a row for
 * the given hash and $this->conf['id'] already exists.
 *
 * @param string $hash phash of the file
 */
public function submitFile_section(string $hash): void
{
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_section');

    $sameHash = $queryBuilder->expr()->eq(
        'phash',
        $queryBuilder->createNamedParameter($hash)
    );
    $samePage = $queryBuilder->expr()->eq(
        'page_id',
        $queryBuilder->createNamedParameter($this->conf['id'], Connection::PARAM_INT)
    );

    $existingSections = (int)$queryBuilder
        ->count('phash')
        ->from('index_section')
        ->where($sameHash, $samePage)
        ->executeQuery()
        ->fetchOne();

    if ($existingSections !== 0) {
        // A section is registered already
        return;
    }

    $this->submit_section($hash, $this->hash['phash']);
}
1219 
/**
 * Deletes the rows of a file hash from index_phash, index_grlist,
 * index_fulltext and index_debug.
 *
 * @param string $phash File hash whose rows are removed
 */
public function removeOldIndexedFiles(string $phash): void
{
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    $criteria = ['phash' => $phash];

    foreach (['index_phash', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        $connectionPool->getConnectionForTable($tableName)->delete($tableName, $criteria);
    }
}
1234 
/**
 * Determines the index status of a document from its index_phash row.
 *
 * Decision order:
 * - no row for the phash: NEW_DOCUMENT
 * - tstamp_maxAge set and exceeded: MAXIMUM_AGE_EXCEEDED
 * - tstamp_minAge set and not yet exceeded: MINIMUM_AGE_NOT_EXCEEDED
 * - $mtime is 0: MODIFICATION_TIME_NOT_SET
 * - $mtime differs from the stored item_mtime: MODIFICATION_TIME_DIFFERS
 * - otherwise: MTIME_MATCHED; the row's tstamp is refreshed via
 *   updateTstamp() only when no tstamp_maxAge is configured
 *
 * @param int $mtime Modification time of the document, 0 if unknown
 * @param string $phash Hash identifying the document in index_phash
 */
public function getIndexStatus(int $mtime, string $phash): IndexStatus
{
    $indexedRow = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(['item_mtime', 'tstamp'], 'index_phash', ['phash' => $phash], [], [], 1)
        ->fetchAssociative();

    // The document is not represented in the index_phash table.
    if (empty($indexedRow)) {
        return IndexStatus::NEW_DOCUMENT;
    }

    // The configured maximum age was exceeded for the document.
    if ($this->tstamp_maxAge && $GLOBALS['EXEC_TIME'] > $indexedRow['tstamp'] + $this->tstamp_maxAge) {
        return IndexStatus::MAXIMUM_AGE_EXCEEDED;
    }

    // A minimum age is configured and has not passed yet.
    if ($this->tstamp_minAge && $GLOBALS['EXEC_TIME'] <= $indexedRow['tstamp'] + $this->tstamp_minAge) {
        return IndexStatus::MINIMUM_AGE_NOT_EXCEEDED;
    }

    // From here on the minimum age is either not set or exceeded; the decision depends on mtime.
    if (!$mtime) {
        return IndexStatus::MODIFICATION_TIME_NOT_SET;
    }

    if ((int)$indexedRow['item_mtime'] !== $mtime) {
        return IndexStatus::MODIFICATION_TIME_DIFFERS;
    }

    // mtime matched the indexed row, so no change was detected.
    if ($this->tstamp_maxAge) {
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($indexedRow['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . ' seconds to expire time).', LogLevel::WARNING);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.', LogLevel::NOTICE);
    }

    return IndexStatus::MTIME_MATCHED;
}
1292 
/**
 * Looks for an index_phash row that has the current phash_grouping and the
 * current content hash ($this->content_md5h).
 *
 * @return array|true The matching row (with key "phash") if one exists, otherwise true
 */
public function checkContentHash(): array|true
{
    $criteria = [
        'phash_grouping' => $this->hash['phash_grouping'],
        'contentHash' => $this->content_md5h,
    ];

    $match = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(['phash'], 'index_phash', $criteria, [], [], 1)
        ->fetchAssociative();

    return empty($match) ? true : $match;
}
1317 
/**
 * Tells whether index_phash holds no row with the given grouping hash and
 * content hash.
 *
 * @param string $hashGr Value matched against "phash_grouping"
 * @param string $content_md5h Value matched against "contentHash"
 * @return bool True if no such row exists
 */
public function checkExternalDocContentHash(string $hashGr, string $content_md5h): bool
{
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash');

    $matches = $connection->count(
        '*',
        'index_phash',
        ['phash_grouping' => $hashGr, 'contentHash' => $content_md5h]
    );

    return 0 === $matches;
}
1339 
/**
 * Tells whether index_grlist contains at least one row for the given phash_x.
 *
 * @param string $phash_x Value matched against the "phash_x" column
 */
public function is_grlist_set(string $phash_x): bool
{
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist');

    $registered = $connection->count('phash_x', 'index_grlist', ['phash_x' => $phash_x]);

    return $registered > 0;
}
1354 
/**
 * Adds the current gr_list to index_grlist for a phash, unless a row with
 * this phash and the md5 value of the gr_list exists already.
 *
 * @param string $phash Value matched against / stored in the "phash" column
 * @param string $phash_x Stored in the "phash_x" column of a new row
 */
public function update_grlist(string $phash, string $phash_x): void
{
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist');

    $alreadyStored = $connection->count(
        'phash',
        'index_grlist',
        ['phash' => $phash, 'hash_gr_list' => md5($this->conf['gr_list'])]
    );

    if ($alreadyStored !== 0) {
        return;
    }

    $this->submit_grlist($phash, $phash_x);
    $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', LogLevel::NOTICE);
}
1379 
/**
 * Sets the tstamp of an index_phash row to the current EXEC_TIME.
 *
 * @param string $phash Hash of the row to update
 * @param int $mtime If not 0, item_mtime is set to this value as well
 */
public function updateTstamp(string $phash, int $mtime = 0): void
{
    $changes = $mtime !== 0
        ? ['tstamp' => $GLOBALS['EXEC_TIME'], 'item_mtime' => $mtime]
        : ['tstamp' => $GLOBALS['EXEC_TIME']];

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash');
    $connection->update('index_phash', $changes, ['phash' => $phash]);
}
1403 
/**
 * Writes the current $this->conf['freeIndexSetId'] (cast to int) into the
 * index_phash row of the given hash.
 *
 * @param string $phash Hash of the row to update
 */
public function updateSetId(string $phash): void
{
    $setId = (int)$this->conf['freeIndexSetId'];

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash');
    $connection->update('index_phash', ['freeIndexSetId' => $setId], ['phash' => $phash]);
}
1421 
/**
 * Stores a parse time in the index_phash row of the given hash.
 *
 * @param string $phash Hash of the row to update
 * @param int $parsetime Value written to the "parsetime" column
 */
public function updateParsetime(string $phash, int $parsetime): void
{
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash');
    $connection->update('index_phash', ['parsetime' => $parsetime], ['phash' => $phash]);
}
1439 
/**
 * Rewrites the rootline columns of all index_section rows belonging to the
 * current page ($this->conf['id']) with the values from getRootLineFields().
 */
public function updateRootline(): void
{
    $rootLineColumns = [];
    $this->getRootLineFields($rootLineColumns);
    $pageId = (int)$this->conf['id'];

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_section');
    $connection->update('index_section', $rootLineColumns, ['page_id' => $pageId]);
}
1458 
/**
 * Adds the rootline columns to a field array.
 *
 * "rl0", "rl1" and "rl2" are taken from the first three entries of
 * $this->conf['rootline_uids']. Additional columns can be configured in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields']
 * as a map of field name => rootline level. Levels missing in the rootline
 * result in 0.
 *
 * @param array $fieldArray Field array, modified in place
 */
public function getRootLineFields(array &$fieldArray): void
{
    $rootLineUids = $this->conf['rootline_uids'] ?? [];

    $fieldArray['rl0'] = (int)($rootLineUids[0] ?? 0);
    $fieldArray['rl1'] = (int)($rootLineUids[1] ?? 0);
    $fieldArray['rl2'] = (int)($rootLineUids[2] ?? 0);

    $additionalFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] ?? [];
    foreach ($additionalFields as $fieldName => $rootLineLevel) {
        // The rootline may be shorter than the configured level; fall back to 0 like rl0-rl2 do.
        $fieldArray[$fieldName] = (int)($rootLineUids[$rootLineLevel] ?? 0);
    }
}
1474 
1475  /********************************
1476  *
1477  * SQL; Submitting words
1478  *
1479  *******************************/
/**
 * Makes sure every word of the list is registered in index_words.
 *
 * Nothing happens for an empty list or when
 * IndexedSearchUtility::isMysqlFullTextEnabled() returns true. Otherwise the
 * number of already registered word ids is compared with the list size; if
 * they differ, every word whose hash is not found in index_words is inserted.
 *
 * @param array $wordListArray Word list keyed by baseword; each entry provides the word id in "hash"
 */
public function checkWordList(array $wordListArray): void
{
    if ($wordListArray === [] || IndexedSearchUtility::isMysqlFullTextEnabled()) {
        return;
    }

    $wordListArrayCount = count($wordListArray);
    $phashArray = array_column($wordListArray, 'hash');

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('index_words');
    $count = (int)$queryBuilder->count('baseword')
        ->from('index_words')
        ->where(
            $queryBuilder->expr()->in(
                'wid',
                $queryBuilder->quoteArrayBasedValueListToStringList($phashArray)
            )
        )
        ->executeQuery()
        ->fetchOne();

    if ($count === $wordListArrayCount) {
        // All words are registered already
        return;
    }

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_words');
    $queryBuilder = $connection->createQueryBuilder();
    $result = $queryBuilder->select('wid')
        ->from('index_words')
        ->where(
            $queryBuilder->expr()->in(
                'wid',
                $queryBuilder->quoteArrayBasedValueListToStringList($phashArray)
            )
        )
        ->executeQuery();

    $this->log_setTSlogMessage('Inserting words: ' . ($wordListArrayCount - $count), LogLevel::NOTICE);

    // Collect the known word ids as array keys, so each word of the list is
    // checked with a single lookup instead of a scan per fetched row.
    $knownWordIds = [];
    while ($row = $result->fetchAssociative()) {
        $knownWordIds[$row['wid']] = true;
    }

    foreach ($wordListArray as $baseword => $wordData) {
        if (isset($knownWordIds[$wordData['hash']])) {
            continue;
        }
        // A duplicate-key error will occur here if an already registered word is not skipped
        // above. The original code notes that this is not a problem as long as the words are
        // no longer than 60 chars (the length of the baseword varchar field).
        $connection->insert(
            'index_words',
            [
                'wid' => $wordData['hash'],
                'baseword' => $baseword,
            ]
        );
    }
}
1543 
/**
 * Replaces the index_rel rows (word relations) of a phash.
 *
 * Nothing happens when IndexedSearchUtility::isMysqlFullTextEnabled() returns
 * true. Otherwise all index_rel rows of the phash are deleted and one row per
 * word of the list is bulk-inserted; words registered in index_words with
 * is_stopword != 0 are left out.
 *
 * @param array $wordList Word list; each entry provides "hash" and "count", optionally "first" and "cmp"
 * @param string $phash Hash of the document the words belong to
 */
public function submitWords(array $wordList, string $phash): void
{
    // Same guard as in checkWordList()
    if (IndexedSearchUtility::isMysqlFullTextEnabled()) {
        return;
    }
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    $queryBuilder = $connectionPool->getQueryBuilderForTable('index_words');
    $result = $queryBuilder->select('wid')
        ->from('index_words')
        ->where(
            $queryBuilder->expr()->neq('is_stopword', $queryBuilder->createNamedParameter(0, Connection::PARAM_INT))
        )
        ->groupBy('wid')
        ->executeQuery();

    // Word ids of all stop words, used as lookup set
    $stopWords = [];
    while ($row = $result->fetchAssociative()) {
        $stopWords[$row['wid']] = true;
    }

    $connectionPool->getConnectionForTable('index_rel')->delete('index_rel', ['phash' => $phash]);

    $fields = ['phash', 'wid', 'count', 'first', 'freq', 'flags'];
    $rows = [];
    foreach ($wordList as $val) {
        if (isset($stopWords[$val['hash']])) {
            continue;
        }
        // Relative frequency of the word; avoid a DivisionByZeroError if no words were counted
        $frequency = $this->wordcount > 0 ? $val['count'] / $this->wordcount : 0.0;
        $rows[] = [
            $phash,
            $val['hash'],
            (int)$val['count'],
            (int)($val['first'] ?? 0),
            $this->freqMap($frequency),
            ($val['cmp'] ?? 0) & $this->flagBitMask,
        ];
    }

    if ($rows !== []) {
        $connectionPool->getConnectionForTable('index_rel')->bulkInsert('index_rel', $rows, $fields);
    }
}
1589 
/**
 * Maps a frequency to an integer.
 *
 * The map factor is freqMax * 100 * freqRange. A frequency of at most 1 is
 * multiplied by the factor and capped at freqRange; a larger frequency is
 * divided by the factor. The result is cast to int.
 *
 * @param float $freq Frequency to map
 */
public function freqMap(float $freq): int
{
    $scale = $this->freqMax * 100 * $this->freqRange;

    $mapped = $freq <= 1
        ? min($freq * $scale, $this->freqRange)
        : $freq / $scale;

    return (int)$mapped;
}
1608 
1609  /********************************
1610  *
1611  * Hashing
1612  *
1613  *******************************/
/**
 * Calculates the hashes of the current TYPO3 page and stores them in $this->hash.
 *
 * "phash_grouping" is the md5 value of the serialized page identity (id,
 * type, language, mount point and static page arguments). "phash" is built
 * from the same data plus the gr_list.
 */
public function setT3Hashes(): void
{
    $staticPageArguments = $this->conf['staticPageArguments'];

    // Identity of the "page": id, type, language, mount point and static page arguments
    $pageIdentity = [
        'id' => (int)$this->conf['id'],
        'type' => (int)$this->conf['type'],
        'sys_lang' => (int)$this->conf['sys_language_uid'],
        'MP' => (string)$this->conf['MP'],
        'staticPageArguments' => is_array($staticPageArguments) ? json_encode($staticPageArguments) : null,
    ];
    $this->hash['phash_grouping'] = md5(serialize($pageIdentity));

    // The plain phash additionally takes the gr_list into account (appended as last element)
    $pageIdentityWithGroups = $pageIdentity + ['gr_list' => (string)$this->conf['gr_list']];
    $this->hash['phash'] = md5(serialize($pageIdentityWithGroups));
}
1633 
/**
 * Calculates the hashes of an external file.
 *
 * "phash_grouping" is the md5 value of the serialized file reference,
 * "phash" additionally includes the sub information.
 *
 * @param string $file File reference
 * @param array $subinfo Additional information distinguishing parts of the file
 * @return array Array with the keys "phash_grouping" and "phash"
 */
public function setExtHashes(string $file, array $subinfo = []): array
{
    $identity = ['file' => $file];
    $groupingHash = md5(serialize($identity));

    $identity['subinfo'] = $subinfo;

    return [
        'phash_grouping' => $groupingHash,
        'phash' => md5(serialize($identity)),
    ];
}
1655 
/**
 * Passes a message to the time tracker and, if "debugMode" is enabled in the
 * extension configuration, also appends it to $this->internal_log.
 *
 * @param string $msg Message to log
 * @param string $logLevel Log level handed to the time tracker
 */
public function log_setTSlogMessage(string $msg, string $logLevel = LogLevel::INFO): void
{
    $this->timeTracker->setTSlogMessage($msg, $logLevel);

    if (empty($this->indexerConfig['debugMode'])) {
        return;
    }

    $this->internal_log[] = $msg;
}
1664 
/**
 * Normalizes a comma separated keyword list: the keywords are split with
 * GeneralUtility::trimExplode(), joined again with ", " and the result is
 * wrapped in a leading and a trailing space.
 *
 * @param string $keywordList Comma separated list of keywords
 */
protected function addSpacesToKeywordList(string $keywordList): string
{
    $joinedKeywords = implode(', ', GeneralUtility::trimExplode(',', $keywordList));

    return ' ' . $joinedKeywords . ' ';
}
1678 
/**
 * Removes all words from the list whose hash was already seen on an earlier
 * word, so every hash occurs only once. The first word of each hash is kept.
 *
 * @param array $wordList Word list keyed by baseword; each entry provides "hash"
 * @return array The word list without entries having a duplicate hash
 */
private function removePhashCollisions(array $wordList): array
{
    // Seen hashes are stored as array keys, giving a constant-time lookup
    // instead of scanning a growing list for every word.
    $seenHashes = [];
    foreach ($wordList as $baseword => $wordData) {
        $hash = $wordData['hash'];
        if (isset($seenHashes[$hash])) {
            unset($wordList[$baseword]);
            continue;
        }
        $seenHashes[$hash] = true;
    }
    return $wordList;
}
1698 
/**
 * Returns the current Unix time in milliseconds, rounded to an integer.
 */
protected function milliseconds(): int
{
    $seconds = microtime(true);

    return (int)round($seconds * 1000);
}
1706 }
‪TYPO3\CMS\IndexedSearch
‪TYPO3\CMS\Core\Utility\PathUtility
Definition: PathUtility.php:27
‪TYPO3\CMS\Core\Database\Connection\PARAM_INT
‪const PARAM_INT
Definition: Connection.php:52
‪TYPO3\CMS\IndexedSearch\Dto\IndexingDataAsString\fromArray
‪static fromArray(array $input)
Definition: IndexingDataAsString.php:35
‪TYPO3\CMS\Core\Utility\PathUtility\isAbsolutePath
‪static isAbsolutePath(string $path)
Definition: PathUtility.php:286
‪TYPO3\CMS\IndexedSearch\Dto\IndexingDataAsString
Definition: IndexingDataAsString.php:24
‪TYPO3\CMS\Core\Html\HtmlParser
Definition: HtmlParser.php:26
‪TYPO3\CMS\Core\Configuration\ExtensionConfiguration
Definition: ExtensionConfiguration.php:47
‪TYPO3\CMS\Core\Core\Environment\getPublicPath
‪static getPublicPath()
Definition: Environment.php:187
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility\isMysqlFullTextEnabled
‪static isMysqlFullTextEnabled()
Definition: IndexedSearchUtility.php:148
‪TYPO3\CMS\IndexedSearch\Type\IndexStatus
‪IndexStatus
Definition: IndexStatus.php:24
‪$fields
‪$fields
Definition: pages.php:5
‪TYPO3\CMS\Core\Utility\PathUtility\basename
‪static basename(string $path)
Definition: PathUtility.php:219
‪TYPO3\CMS\Core\Utility\GeneralUtility\writeFile
‪static bool writeFile(string $file, string $content, bool $changePermissions=false)
Definition: GeneralUtility.php:1464
‪TYPO3\CMS\Core\Utility\GeneralUtility\getUrl
‪static string false getUrl(string $url)
Definition: GeneralUtility.php:1439
‪TYPO3\CMS\IndexedSearch\Dto\IndexingDataAsArray\fromArray
‪static fromArray(array $input)
Definition: IndexingDataAsArray.php:37
‪TYPO3\CMS\Core\Http\RequestFactory
Definition: RequestFactory.php:30
‪TYPO3\CMS\Core\Database\Connection
Definition: Connection.php:41
‪TYPO3\CMS\Webhooks\Message\$url
‪identifier readonly UriInterface $url
Definition: LoginErrorOccurredMessage.php:36
‪$GLOBALS
‪$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['adminpanel']['modules']
Definition: ext_localconf.php:25
‪TYPO3\CMS\Core\Core\Environment
Definition: Environment.php:41
‪TYPO3\CMS\Core\Utility\MathUtility
Definition: MathUtility.php:24
‪TYPO3\CMS\Core\Database\ConnectionPool
Definition: ConnectionPool.php:46
‪TYPO3\CMS\Core\Utility\MathUtility\forceIntegerInRange
‪static int forceIntegerInRange(mixed $theInt, int $min, int $max=2000000000, int $defaultValue=0)
Definition: MathUtility.php:34
‪TYPO3\CMS\IndexedSearch\Dto\IndexingDataAsArray
Definition: IndexingDataAsArray.php:26
‪TYPO3\CMS\Core\Utility\GeneralUtility
Definition: GeneralUtility.php:52
‪TYPO3\CMS\Core\TimeTracker\TimeTracker
Definition: TimeTracker.php:34
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility
Definition: IndexedSearchUtility.php:29
‪TYPO3\CMS\Core\Utility\GeneralUtility\trimExplode
‪static list< string > trimExplode(string $delim, string $string, bool $removeEmptyValues=false, int $limit=0)
Definition: GeneralUtility.php:817