‪TYPO3CMS  ‪main
Indexer.php
Go to the documentation of this file.
1 <?php
2 
3 declare(strict_types=1);
4 
5 /*
6  * This file is part of the TYPO3 CMS project.
7  *
8  * It is free software; you can redistribute it and/or modify it under
9  * the terms of the GNU General Public License, either version 2
10  * of the License, or any later version.
11  *
12  * For the full copyright and license information, please read the
13  * LICENSE.txt file that was distributed with this source code.
14  *
15  * The TYPO3 project - inspiring people to share!
16  */
17 
19 
20 use Psr\Log\LogLevel;
35 
41 class Indexer
42 {
46  public string $excludeSections = 'script,style';
47 
51  public array $external_parsers = [];
52 
57  public int $tstamp_minAge = 0;
58 
62  public int $maxExternalFiles = 0;
63 
67  public bool $forceIndexing = false;
68 
69  public array $defaultIndexingDataPayload = [
70  'title' => '',
71  'description' => '',
72  'keywords' => '',
73  'body' => '',
74  ];
75 
76  public int $wordcount = 0;
77  public int $externalFileCounter = 0;
78  public array $conf = [];
79 
83  public array $indexerConfig = [];
84 
88  public array $hash = [];
89 
93  public array $file_phash_arr = [];
94 
95  public IndexingDataAsString $indexingDataStringDto;
96 
100  public string $content_md5h = '';
101  public string $indexExternalUrl_content = '';
102  public int $freqRange = 32000;
103  public float $freqMax = 0.1;
104  public int $flagBitMask;
105 
/**
 * Receives the collaborating services and derives the numeric indexing limits
 * from the "indexed_search" extension configuration.
 */
public function __construct(
    private readonly TimeTracker $timeTracker,
    private readonly Lexer $lexer,
    private readonly RequestFactory $requestFactory,
    private readonly ConnectionPool $connectionPool,
    ExtensionConfiguration $extensionConfiguration,
) {
    // Indexer configuration from Extension Manager interface
    $this->indexerConfig = $extensionConfiguration->get('indexed_search');

    // "minAge" is multiplied by 3600 before being clamped to a non-negative value.
    $minAge = (int)($this->indexerConfig['minAge'] ?? 0) * 3600;
    $this->tstamp_minAge = MathUtility::forceIntegerInRange($minAge, 0);

    // Upper bound for external files; defaults to 5 and is clamped to 0..1000.
    $maxExternalFiles = (int)($this->indexerConfig['maxExternalFiles'] ?? 5);
    $this->maxExternalFiles = MathUtility::forceIntegerInRange($maxExternalFiles, 0, 1000);

    // Bit mask is clamped to a single byte (0..255).
    $flagBitMask = (int)($this->indexerConfig['flagBitMask'] ?? 0);
    $this->flagBitMask = MathUtility::forceIntegerInRange($flagBitMask, 0, 255);
}
119 
/**
 * Prepares the indexer for one indexing run.
 *
 * @param array|null $configuration Indexing configuration; when null the
 *                                  configuration already stored in $this->conf is kept.
 */
public function init(?array $configuration = null): void
{
    if ($configuration !== null) {
        $this->conf = $configuration;
    }
    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();
    // Initialize external document parsers only when requested. The key is
    // optional, so a configuration without it must not raise an
    // "undefined array key" warning.
    if ($this->conf['index_externals'] ?? false) {
        $this->initializeExternalParsers();
    }
}
136 
/**
 * Instantiates every parser registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers']
 * and keeps those whose initParser() call succeeds.
 */
public function initializeExternalParsers(): void
{
    $registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] ?? [];
    foreach ($registeredParsers as $extension => $className) {
        $this->external_parsers[$extension] = GeneralUtility::makeInstance($className);
        $parser = $this->external_parsers[$extension];
        $parser->pObj = $this;
        // A parser that reports FALSE from its initialisation is dropped again.
        if ($parser->initParser($extension)) {
            continue;
        }
        unset($this->external_parsers[$extension]);
    }
}
148 
149  /********************************
150  *
151  * Indexing; TYPO3 pages (HTML content)
152  *
153  *******************************/
/**
 * Indexes the TYPO3 page whose data is held in $this->conf / $this->hash.
 *
 * Flow, as visible in this method:
 *  - nothing is indexed unless the index status demands it, the gr_list entry
 *    is missing, or $this->forceIndexing is set;
 *  - the page content is split into title/keywords/description/body and hashed;
 *  - if no page with the same content hash exists (or re-indexing is required)
 *    the words are extracted, analysed and submitted, otherwise only the
 *    timestamp, set id, gr_list and rootline are refreshed.
 *
 * NOTE(review): this listing skips a line between 206 and 208. The closing brace
 * on line 211 has no visible opening counterpart, so a condition presumably
 * starts on the missing line - confirm against the upstream file.
 */
157  public function indexTypo3PageContent(): void
158  {
159  $indexStatus = $this->getIndexStatus($this->conf['mtime'], $this->hash['phash']);
160  $reindexingRequired = $indexStatus->reindexRequired();
161  $is_grlist = $this->is_grlist_set($this->hash['phash']);
162  if ($reindexingRequired || !$is_grlist || $this->forceIndexing) {
163  // Setting message:
164  if ($this->forceIndexing) {
165  $this->log_setTSlogMessage('Indexing needed, reason: Forced', LogLevel::NOTICE);
166  } elseif ($reindexingRequired) {
167  $this->log_setTSlogMessage('Indexing needed, reason: ' . $indexStatus->reason(), LogLevel::NOTICE);
168  } else {
169  $this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!', LogLevel::NOTICE);
170  }
171  // Divide into title,keywords,description and body:
172  $this->timeTracker->push('Split content');
173  $this->indexingDataStringDto = $this->splitHTMLContent($this->conf['content']);
// An explicitly configured document title replaces the one parsed from the HTML.
174  if ($this->conf['indexedDocTitle']) {
175  $this->indexingDataStringDto->title = $this->conf['indexedDocTitle'];
176  }
177  $this->timeTracker->pull();
178  // Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so don't!)
179  $this->content_md5h = md5(implode('', $this->indexingDataStringDto->toArray()));
180  // This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
181  // If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more.
182  // This will also prevent pages from being indexed if a fe_users has logged in, and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem.
183  $checkCHash = $this->checkContentHash();
// A non-array result is treated as "no page with this content hash found".
184  if (!is_array($checkCHash) || $reindexingRequired) {
// Start of the time measurement that is written by updateParsetime() below.
185  $Pstart = $this->milliseconds();
186  $this->timeTracker->push('Converting entities of content');
187  $this->charsetEntity2utf8($this->indexingDataStringDto);
188  $this->timeTracker->pull();
189 
190  // Splitting words
191  $this->timeTracker->push('Extract words from content');
192  $splitInWords = $this->processWordsInArrays($this->indexingDataStringDto);
193  $this->timeTracker->pull();
194 
195  // Analyze the indexed words.
196  $this->timeTracker->push('Analyze the extracted words');
197  $indexArr = $this->indexAnalyze($splitInWords);
198  $this->timeTracker->pull();
199 
200  // Submitting page (phash) record
201  $this->timeTracker->push('Submitting page');
202  $this->submitPage();
203  $this->timeTracker->pull();
204 
205  // Check words and submit to word list if not there
206  $this->timeTracker->push('Check word list and submit words');
// NOTE(review): listing line 207 is not part of this view (see method comment).
208  $indexArr = $this->removePhashCollisions($indexArr);
209  $this->checkWordList($indexArr);
210  $this->submitWords($indexArr, $this->hash['phash']);
211  }
212  $this->timeTracker->pull();
213 
214  // Set parse time
215  $this->updateParsetime($this->hash['phash'], $this->milliseconds() - $Pstart);
216 
217  // Checking external files if configured for.
218  if ($this->conf['index_externals']) {
219  $this->timeTracker->push('Checking external files', '');
220  $this->extractLinks($this->conf['content']);
221  $this->timeTracker->pull();
222  }
223  } else {
224  // Update the timestamp
225  $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
226  $this->updateSetId($this->hash['phash']);
227 
228  // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
229  $this->update_grlist($checkCHash['phash'], $this->hash['phash']);
230  $this->updateRootline();
231  $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
232  }
233  } else {
234  $this->log_setTSlogMessage('Indexing not needed, reason: ' . $indexStatus->reason());
235  }
236  }
237 
/**
 * Splits an HTML document into title, keywords, description and plain-text body.
 *
 * - Everything from the first "<body" onwards is the body; the part in front of
 *   it is searched for <title> and (if "index_metatags" is enabled) <meta> tags.
 * - Only the part of the title after the first ":" is kept, if there is one.
 * - TYPO3SEARCH markers and the tags listed in $this->excludeSections are
 *   processed before all remaining markup is stripped from the body.
 *
 * @param string $content Complete HTML source of the document
 */
public function splitHTMLContent(string $content): IndexingDataAsString
{
    $indexingDataDto = IndexingDataAsString::fromArray($this->defaultIndexingDataPayload);
    $indexingDataDto->body = stristr($content, '<body') ?: '';
    // The head is whatever precedes the body. The length is computed explicitly:
    // a negative length of "-0" would be treated as 0 and discard the whole
    // head (and with it the title) for documents that have no <body> tag.
    $headPart = substr($content, 0, strlen($content) - strlen($indexingDataDto->body));
    // get title
    $this->embracingTags($headPart, 'TITLE', $indexingDataDto->title, $dummy2, $dummy);
    $titleParts = explode(':', $indexingDataDto->title, 2);
    $indexingDataDto->title = trim($titleParts[1] ?? $titleParts[0]);
    // get keywords and description meta tags
    if ($this->conf['index_metatags'] ?? false) {
        $meta = [];
        $i = 0;
        // Each successful call consumes one <meta> tag from $headPart and stores
        // its attribute string in $meta[$i].
        while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $meta[$i])) {
            $i++;
        }
        // The slot of the final, unsuccessful call stays unset and ends this loop.
        for ($i = 0; isset($meta[$i]); $i++) {
            // decode HTML entities, meta tag content needs to be encoded later
            $attributes = GeneralUtility::get_tag_attributes($meta[$i], true);
            $metaName = (string)($attributes['name'] ?? '');
            // A meta tag without "content" attribute contributes an empty value
            // instead of passing null on.
            $metaContent = (string)($attributes['content'] ?? '');
            if (stripos($metaName, 'keywords') !== false) {
                $indexingDataDto->keywords .= ',' . $this->addSpacesToKeywordList($metaContent);
            }
            if (stripos($metaName, 'description') !== false) {
                $indexingDataDto->description .= ',' . $metaContent;
            }
        }
    }
    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($indexingDataDto->body);
    // Get rid of unwanted sections (i.e. scripting and style stuff) in body
    foreach (explode(',', $this->excludeSections) as $tag) {
        while ($this->embracingTags($indexingDataDto->body, $tag, $dummy, $indexingDataDto->body, $dummy2)) {
        }
    }
    // remove tags, but first make sure we don't concatenate words by doing it
    $indexingDataDto->body = str_replace('<', ' <', $indexingDataDto->body);
    $indexingDataDto->body = trim(strip_tags($indexingDataDto->body));
    $indexingDataDto->keywords = trim($indexingDataDto->keywords);
    $indexingDataDto->description = trim($indexingDataDto->description);

    return $indexingDataDto;
}
287 
/**
 * Reads the charset announced by a <meta http-equiv="Content-Type" ...> tag.
 *
 * @param string $content HTML source
 * @return string The charset as written in the tag, or '' when no such tag or no charset is found
 */
public function getHTMLcharset(string $content): string
{
    // @todo: Use \DOMDocument and DOMXpath
    $contentTypeTagPattern = '/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i';
    if (!preg_match($contentTypeTagPattern, $content, $tagMatch)) {
        return '';
    }

    // The charset is looked up inside the matched tag only.
    $charsetPattern = '/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i';
    if (!preg_match($charsetPattern, $tagMatch[0], $charsetMatch)) {
        return '';
    }

    return $charsetMatch[1];
}
302 
/**
 * Converts an HTML document to UTF-8 and decodes its HTML entities.
 *
 * @param string $content HTML source
 * @param string $charset Source charset; when empty it is detected via getHTMLcharset()
 * @return string Entity-decoded content; left in its original encoding when the
 *                charset is empty, already UTF-8 or unknown to mbstring
 */
public function convertHTMLToUtf8(string $content, string $charset = ''): string
{
    // Find charset
    $charset = $charset ?: $this->getHTMLcharset($content);
    $charset = strtolower(trim($charset));
    // Convert charset
    if ($charset && $charset !== 'utf-8') {
        try {
            $converted = mb_convert_encoding($content, 'utf-8', $charset);
            if (is_string($converted)) {
                $content = $converted;
            }
        } catch (\ValueError) {
            // The charset comes from the (untrusted) document. An encoding name
            // unknown to mbstring must not abort indexing, so the content is
            // used as it is.
        }
    }
    // Convert entities, assuming document is now UTF-8
    return html_entity_decode($content);
}
318 
/**
 * Finds the first occurrence of a tag (case-insensitive) and splits the string around it.
 *
 * @param string $string The string to search
 * @param string $tagName Tag name without brackets, e.g. "title"
 * @param string|null $tagContent Receives the text between start and end tag ('' if there is no end tag)
 * @param string|null $stringAfter Receives $string with the whole element removed; if there is no
 *                                 end tag, receives only the text following the start tag
 * @param string|null $paramList Receives the raw attribute text of the start tag
 * @return bool FALSE if the tag does not occur (the reference arguments are left untouched), TRUE otherwise
 */
public function embracingTags(string $string, string $tagName, ?string &$tagContent, ?string &$stringAfter, ?string &$paramList): bool
{
    $endTag = '</' . $tagName . '>';
    $startTag = '<' . $tagName;
    // stristr used because we want a case-insensitive search for the tag.
    $fromStartTag = stristr($string, $startTag);
    // if the tag was not found, return FALSE
    if (!$fromStartTag) {
        return false;
    }
    // Split "<attributes>" from "<rest>". A truncated start tag without ">" yields
    // only one part; pad it so the rest is an empty string instead of an
    // undefined offset (which ends in a TypeError under strict types).
    [$paramList, $afterStartTag] = array_pad(
        explode('>', substr($fromStartTag, strlen($startTag)), 2),
        2,
        ''
    );
    $fromEndTag = stristr($afterStartTag, $endTag);
    if ($fromEndTag) {
        $stringBefore = substr($string, 0, (int)stripos($string, $startTag));
        $tagContent = substr($afterStartTag, 0, strlen($afterStartTag) - strlen($fromEndTag));
        $stringAfter = $stringBefore . substr($fromEndTag, strlen($endTag));
    } else {
        // No end tag: nothing counts as content and only the text after the
        // start tag is handed back.
        $tagContent = '';
        $stringAfter = $afterStartTag;
    }
    return true;
}
352 
/**
 * Reduces $body to the parts selected by <!--TYPO3SEARCH_begin--> and
 * <!--TYPO3SEARCH_end--> markers.
 *
 * Text following a "begin" marker is kept. Text in front of an "end" marker is
 * kept only if it was not preceded by a "begin" marker (i.e. the leading part of
 * the document). Everything else is dropped.
 *
 * @param string $body HTML body, modified in place when markers are present
 * @return bool TRUE if at least one marker was found, FALSE otherwise ($body untouched)
 */
public function typoSearchTags(string &$body): bool
{
    $expBody = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body) ?: [];
    if (count($expBody) <= 1) {
        return false;
    }
    $body = '';
    $prev = '';
    foreach ($expBody as $val) {
        $part = explode('-->', $val, 2);
        $marker = trim($part[0]);
        if ($marker === 'begin') {
            // A marker that is cut off before its closing "-->" has no text
            // after it; treat that as empty instead of reading an undefined offset.
            $body .= $part[1] ?? '';
            $prev = '';
        } elseif ($marker === 'end') {
            $body .= $prev;
        } else {
            $prev = $val;
        }
    }
    return true;
}
381 
/**
 * Walks over all hyperlinks of the given HTML and indexes their targets:
 * external URLs via indexExternalUrl() (if "indexExternalURLs" is enabled),
 * local files via indexRegularDocument().
 *
 * @param string $content HTML content to search for links
 */
public function extractLinks(string $content): void
{
    foreach ($this->extractHyperLinks($content) as $linkInfo) {
        // localPath means: This file is sent by a download script. While the indexed URL
        // has to point to $linkInfo['href'], the absolute path to the file is specified here!
        $linkSource = htmlspecialchars_decode($linkInfo['localPath'] ?: $linkInfo['href']);
        // Parse URL:
        $qParts = parse_url($linkSource);
        // Check for jumpurl (TYPO3 specific thing...)
        if (($qParts['query'] ?? false) && str_contains($qParts['query'], 'jumpurl=')) {
            parse_str($qParts['query'], $getP);
            // "jumpurl=" may merely be the tail of another parameter name, or be
            // submitted as array; only a plain string can replace the link target.
            $jumpUrl = $getP['jumpurl'] ?? null;
            if (is_string($jumpUrl)) {
                $linkSource = $jumpUrl;
                $qParts = parse_url($linkSource);
            }
        }
        if (!$linkInfo['localPath'] && ($qParts['scheme'] ?? false)) {
            if ($this->indexerConfig['indexExternalURLs'] ?? false) {
                // Index external URL (http or otherwise)
                $this->indexExternalUrl($linkSource);
            }
            continue;
        }
        // Links carrying a query string are not treated as files.
        if ($qParts['query'] ?? false) {
            continue;
        }
        $linkSource = urldecode($linkSource);
        if (GeneralUtility::isAllowedAbsPath($linkSource)) {
            $localFile = $linkSource;
        } else {
            $localFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $linkSource);
        }
        if (!$localFile || !@is_file($localFile)) {
            continue;
        }
        // Index local file:
        if ($linkInfo['localPath']) {
            // PATHINFO_EXTENSION yields '' for files without extension instead of
            // a missing array key.
            $ext = strtolower(pathinfo($linkSource, PATHINFO_EXTENSION));
            $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
        } else {
            $this->indexRegularDocument($linkSource);
        }
    }
}
431 
/**
 * Collects all <a> tags that carry a non-empty href which is not a pure
 * fragment ("#...") link.
 *
 * @param string $html HTML content
 * @return array List of arrays with the keys "tag", "href" and "localPath"
 */
public function extractHyperLinks(string $html): array
{
    $htmlParser = GeneralUtility::makeInstance(HtmlParser::class);
    $hyperLinksData = [];
    foreach ($htmlParser->splitTags('a', $html) as $index => $tagData) {
        // Only the odd entries of the split result are inspected as tags.
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'a') {
            continue;
        }
        if (empty($tagAttributes[0]['href']) || str_starts_with($tagAttributes[0]['href'], '#')) {
            continue;
        }
        $href = $tagAttributes[0]['href'];
        $hyperLinksData[] = [
            'tag' => $tagData,
            'href' => $href,
            'localPath' => $this->createLocalPath(urldecode($href)),
        ];
    }
    return $hyperLinksData;
}
459 
/**
 * Returns the href of the first <base> tag that has a non-empty one.
 *
 * @param string $html HTML content
 * @return string The href value, or '' if no <base> tag provides one
 */
public function extractBaseHref(string $html): string
{
    $htmlParser = GeneralUtility::makeInstance(HtmlParser::class);
    foreach ($htmlParser->splitTags('base', $html) as $index => $tagData) {
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'base') {
            continue;
        }
        // A <base> tag may only carry e.g. a "target" attribute; a missing href
        // must not end up as null in a method declared to return string.
        $href = (string)($tagAttributes[0]['href'] ?? '');
        if ($href) {
            return $href;
        }
    }
    return '';
}
482 
483  /******************************************
484  *
485  * Indexing; external URL
486  *
487  ******************************************/
/**
 * Fetches an external URL and indexes it as an HTML document, provided a HEAD
 * request reports a "text/html" content type.
 *
 * The fetched content is also stored in $this->indexExternalUrl_content.
 *
 * @param string $externalUrl URL to index
 */
public function indexExternalUrl(string $externalUrl): void
{
    // Get headers:
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    if (!is_array($urlHeaders)) {
        return;
    }
    // HTTP header names are case-insensitive and the header may be absent altogether.
    $contentType = (string)(array_change_key_case($urlHeaders, CASE_LOWER)['content-type'] ?? '');
    if (stripos($contentType, 'text/html') === false) {
        return;
    }
    // The cast keeps a failed fetch (non-string result) from being assigned to
    // the string-typed property.
    $content = (string)GeneralUtility::getUrl($externalUrl);
    $this->indexExternalUrl_content = $content;
    if ($content === '') {
        return;
    }
    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    GeneralUtility::writeFile($tmpFile, $content);
    try {
        // Index that file. Using "TRUE" for second parameter to force indexing of
        // external URLs (mtime doesn't make sense, does it?)
        $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
    } finally {
        // Remove the temporary file even if indexing fails.
        unlink($tmpFile);
    }
}
510 
/**
 * Requests the headers of a URL with a HEAD request.
 *
 * @param string $url The URL
 * @return array|false Header name => all values of that header joined without
 *                     separator, or FALSE if the request raised an exception
 */
public function getUrlHeaders(string $url): array|false
{
    try {
        $flattenedHeaders = [];
        $rawHeaders = $this->requestFactory->request($url, 'HEAD')->getHeaders();
        foreach ($rawHeaders as $headerName => $headerValues) {
            $flattenedHeaders[$headerName] = implode('', $headerValues);
        }
        return $flattenedHeaders;
    } catch (\Exception $e) {
        // fail silently if the HTTP request failed
        return false;
    }
}
532 
/**
 * Resolves a link target to a local file path by trying, in this order:
 * absRefPrefix, site URL, absolute URL and finally relative URL. The first
 * non-empty result is returned.
 *
 * @param string $sourcePath The link target
 * @return string Local path, or '' if none of the strategies produced one
 */
protected function createLocalPath(string $sourcePath): string
{
    $strategies = [
        $this->createLocalPathUsingAbsRefPrefix(...),
        $this->createLocalPathUsingDomainURL(...),
        $this->createLocalPathFromAbsoluteURL(...),
    ];
    foreach ($strategies as $resolve) {
        $candidate = $resolve($sourcePath);
        if ($candidate !== '') {
            return $candidate;
        }
    }
    // Last resort; its result is returned as is, even if empty.
    return $this->createLocalPathFromRelativeURL($sourcePath);
}
554 
/**
 * Maps a URL that starts with the site URL of the current request to a file
 * below the public path.
 *
 * @param string $sourcePath The link target
 * @return string Local path, or '' if the URL does not match or the file is not allowed
 */
protected function createLocalPathUsingDomainURL(string $sourcePath): string
{
    $siteUrl = $GLOBALS['TYPO3_REQUEST']->getAttribute('normalizedParams')->getSiteUrl();
    if (!str_starts_with($sourcePath, $siteUrl)) {
        return '';
    }
    $candidate = Environment::getPublicPath() . '/' . substr($sourcePath, strlen($siteUrl));

    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
572 
/**
 * Maps a URL that starts with the configured "absRefPrefix" to a file below
 * the public path.
 *
 * @param string $sourcePath The link target
 * @return string Local path, or '' if no prefix is configured, the URL does not
 *                start with it, or the file is not allowed
 */
protected function createLocalPathUsingAbsRefPrefix(string $sourcePath): string
{
    $typoScriptConfig = $GLOBALS['TYPO3_REQUEST']->getAttribute('frontend.typoscript')?->getConfigArray();
    if (!$typoScriptConfig) {
        return '';
    }
    $prefix = $typoScriptConfig['absRefPrefix'] ?? '';
    $prefixLength = strlen($prefix);
    if ($prefixLength <= 0 || !str_starts_with($sourcePath, $prefix)) {
        return '';
    }
    $candidate = Environment::getPublicPath() . '/' . substr($sourcePath, $prefixLength);

    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
595 
/**
 * Maps a URL starting with "/" to a file below the public path.
 *
 * @param string $sourcePath The link target
 * @return string Local path, or '' if the URL has no leading slash or the file is not allowed
 */
protected function createLocalPathFromAbsoluteURL(string $sourcePath): string
{
    if (!str_starts_with($sourcePath, '/')) {
        return '';
    }
    // Only the first slash is removed before the public path is prepended.
    $candidate = Environment::getPublicPath() . '/' . substr($sourcePath, 1);

    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
611 
/**
 * Maps a relative URL to a file below the public path.
 *
 * @param string $sourcePath The link target
 * @return string Local path, or '' if the URL is not relative or the file is not allowed
 */
protected function createLocalPathFromRelativeURL(string $sourcePath): string
{
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }
    $candidate = Environment::getPublicPath() . '/' . $sourcePath;

    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
626 
/**
 * Tells whether a URL has neither a scheme nor a path beginning with "/".
 *
 * A URL that parse_url() cannot parse has neither and therefore counts as relative.
 *
 * @param string $url The URL to check
 */
protected static function isRelativeURL(string $url): bool
{
    $urlParts = @parse_url($url);
    if (isset($urlParts['scheme']) && $urlParts['scheme'] !== '') {
        return false;
    }
    $firstPathCharacter = $urlParts['path'][0] ?? '';

    return !str_starts_with($firstPathCharacter, '/');
}
635 
/**
 * Tells whether a path points to an existing file inside the public web path.
 *
 * @param string $filePath Absolute file path; "../" segments are resolved before the check
 */
protected static function isAllowedLocalFile(string $filePath): bool
{
    $filePath = GeneralUtility::resolveBackPath($filePath);
    // Compare against the public path including a trailing slash. A bare prefix
    // comparison would also accept sibling directories whose name merely starts
    // with the public directory name (e.g. "<public>_backup/").
    // NOTE(review): assumes Environment::getPublicPath() uses "/" separators - confirm for Windows.
    $publicPath = rtrim(Environment::getPublicPath(), '/') . '/';

    return str_starts_with($filePath, $publicPath) && is_file($filePath);
}
646 
647  /******************************************
648  *
649  * Indexing; external files (PDF, DOC, etc)
650  *
651  ******************************************/
/**
 * Indexes a regular document (a file handled by one of the external parsers).
 *
 * The file is resolved to an absolute path (or $contentTmpFile is used as is),
 * split into content parts by the parser, and every part is indexed on its own
 * if the index status requires it or $force is set. A section record is
 * submitted for every part, whether it was indexed or not.
 *
 * NOTE(review): this listing skips a line between 727 and 729. The closing brace
 * on line 732 has no visible opening counterpart, so a condition presumably
 * starts on the missing line - confirm against the upstream file.
 *
 * @param string $file File name as it is stored in the index; relative names are resolved against the public path
 * @param bool $force If set, indexing happens regardless of index status and file counter
 * @param string $contentTmpFile Path of a file holding the content to index instead of $file
 * @param string $altExtension File extension to use instead of the one found in $file
 */
660  public function indexRegularDocument(string $file, bool $force = false, string $contentTmpFile = '', string $altExtension = ''): void
661  {
662  $fI = pathinfo($file);
// NOTE(review): $fI['extension'] is read without a guard; pathinfo() omits the key for names without extension.
663  $ext = $altExtension ?: strtolower($fI['extension']);
664  // Create abs-path
665  if (!$contentTmpFile) {
666  if (!‪PathUtility::isAbsolutePath($file)) {
667  // Relative, prepend public web path:
668  $absFile = GeneralUtility::getFileAbsFileName(‪Environment::getPublicPath() . '/' . $file);
669  } else {
670  // Absolute, pass-through:
671  $absFile = $file;
672  }
673  $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
674  } else {
// A given temporary file is used directly, without the allowed-path check above.
675  $absFile = $contentTmpFile;
676  }
677  // Indexing the document:
678  if ($absFile && @is_file($absFile)) {
679  if ($this->external_parsers[$ext] ?? false) {
680  $fileInfo = stat($absFile);
681  $cParts = $this->fileContentParts($ext, $absFile);
// Every content part gets its own phash and is handled independently.
682  foreach ($cParts as $cPKey) {
683  $this->timeTracker->push('Index: ' . str_replace('.', '_', ‪PathUtility::basename($file)) . ($cPKey ? '#' . $cPKey : ''));
684  $Pstart = $this->milliseconds();
685  $subinfo = ['key' => $cPKey];
686  // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
687  $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
688  $indexStatus = $this->getIndexStatus($fileInfo['mtime'], $phash_arr['phash']);
689  $reindexingRequired = $indexStatus->reindexRequired();
690  if ($reindexingRequired || $force) {
691  if ($reindexingRequired) {
692  $this->log_setTSlogMessage('Indexing needed, reason: ' . $indexStatus->reason(), LogLevel::NOTICE);
693  } else {
694  $this->log_setTSlogMessage('Indexing forced by flag', LogLevel::NOTICE);
695  }
696  // Check external file counter:
697  if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
698  // Divide into title,keywords,description and body:
699  $this->timeTracker->push('Split content');
700  $indexingDataDtoAsString = $this->readFileContent($ext, $absFile, $cPKey);
701  $this->timeTracker->pull();
// readFileContent() returns null when no usable data object was produced.
702  if ($indexingDataDtoAsString !== null) {
703  // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
704  $content_md5h = md5(implode('', $indexingDataDtoAsString->toArray()));
705  if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
706  // Increment counter:
707  $this->externalFileCounter++;
708 
709  // Splitting words
710  $this->timeTracker->push('Extract words from content');
711  $splitInWords = $this->processWordsInArrays($indexingDataDtoAsString);
712  $this->timeTracker->pull();
713 
714  // Analyze the indexed words.
715  $this->timeTracker->push('Analyze the extracted words');
716  $indexArr = $this->indexAnalyze($splitInWords);
717  $this->timeTracker->pull();
718 
719  // Submitting page (phash) record
720  $this->timeTracker->push('Submitting page');
721 
722  // Unfortunately the original creation time cannot be determined, therefore we fall back to the modification date
723  $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $indexingDataDtoAsString);
724  $this->timeTracker->pull();
725 
726  // Check words and submit to word list if not there
727  $this->timeTracker->push('Check word list and submit words');
// NOTE(review): listing line 728 is not part of this view (see method comment).
729  $indexArr = $this->removePhashCollisions($indexArr);
730  $this->checkWordList($indexArr);
731  $this->submitWords($indexArr, $phash_arr['phash']);
732  }
733  $this->timeTracker->pull();
734 
735  // Set parsetime
736  $this->updateParsetime($phash_arr['phash'], $this->milliseconds() - $Pstart);
737  } else {
738  // Update the timestamp
739  $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
740  $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
741  }
742  } else {
743  $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
744  }
745  } else {
746  $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
747  }
748  } else {
749  $this->log_setTSlogMessage('Indexing not needed, reason: ' . $indexStatus->reason());
750  }
751  // Checking and setting sections:
752  $this->submitFile_section($phash_arr['phash']);
753  // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
754  $this->timeTracker->pull();
755  }
756  } else {
757  $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
758  }
759  } else {
760  $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
761  }
762  }
763 
/**
 * Lets the external parser registered for the file extension read the file.
 *
 * @param string $fileExtension File extension, used to look up the parser
 * @param string $absoluteFileName Absolute path of the file
 * @param string|int $sectionPointer Identifies the content part to read
 * @return IndexingDataAsString|null The parsed content, or null if no parser is
 *                                   registered or the parser returned nothing usable
 */
public function readFileContent(string $fileExtension, string $absoluteFileName, string|int $sectionPointer): ?IndexingDataAsString
{
    // Consult relevant external document parser. An extension without a
    // registered parser yields null rather than an undefined-key warning.
    $parser = $this->external_parsers[$fileExtension] ?? null;
    if (!is_object($parser)) {
        return null;
    }
    $indexingDataDto = $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);

    // Arrays are still accepted from parsers, but flagged as deprecated and converted.
    if (is_array($indexingDataDto)) {
        trigger_error(
            sprintf(
                'The method %s returns an array, which is deprecated and will stop working in TYPO3 v14.0. Return an instance of %s instead.',
                get_class($parser) . '::readFileContent()',
                IndexingDataAsString::class
            ),
            E_USER_DEPRECATED
        );
        $indexingDataDto = IndexingDataAsString::fromArray($indexingDataDto);
    }

    return $indexingDataDto instanceof IndexingDataAsString ? $indexingDataDto : null;
}
798 
/**
 * Asks the external parser registered for the extension into which parts the
 * file is to be divided.
 *
 * @param string $ext File extension, used to look up the parser
 * @param string $absFile Absolute path of the file
 * @return array Part identifiers; [0] when no parser object is registered for the extension
 */
public function fileContentParts(string $ext, string $absFile): array
{
    // Consult relevant external document parser. An extension without a
    // registered parser falls back to the single default part instead of
    // raising an undefined-key warning.
    $parser = $this->external_parsers[$ext] ?? null;
    if (is_object($parser)) {
        return $parser->fileContentParts($ext, $absFile);
    }
    return [0];
}
815 
/**
 * Wraps plain content into an indexing data object: the content becomes the
 * body, all other fields keep the values of $this->defaultIndexingDataPayload.
 *
 * @param string $content Content of the document
 */
public function splitRegularContent(string $content): IndexingDataAsString
{
    $payload = IndexingDataAsString::fromArray($this->defaultIndexingDataPayload);
    $payload->body = $content;

    return $payload;
}
826 
827  /**********************************
828  *
829  * Analysing content, Extracting words
830  *
831  **********************************/
/**
 * Decodes HTML entities in every non-empty field of the given data object.
 * The object is modified in place.
 *
 * @param IndexingDataAsString $indexingDataDto Data object whose fields are decoded
 */
public function charsetEntity2utf8(IndexingDataAsString $indexingDataDto): void
{
    foreach ($indexingDataDto->toArray() as $field => $text) {
        if ((string)$text === '') {
            continue;
        }
        // decode all numeric / html-entities in the string to real characters:
        $indexingDataDto->{$field} = html_entity_decode($text);
    }
}
845 
/**
 * Splits every field of the given data object into words using the lexer.
 *
 * @param IndexingDataAsString $input Field texts
 * @return IndexingDataAsArray Word lists per field; title, keywords and description without duplicates
 */
public function processWordsInArrays(IndexingDataAsString $input): IndexingDataAsArray
{
    // split all parts to words
    $wordsPerField = [];
    foreach ($input->toArray() as $field => $text) {
        $wordsPerField[$field] = $this->lexer->split2Words($text);
    }
    $result = IndexingDataAsArray::fromArray($wordsPerField);

    // For title, keywords, and description we don't want duplicates
    foreach (['title', 'keywords', 'description'] as $field) {
        $result->{$field} = array_unique($result->{$field});
    }

    return $result;
}
867 
/**
 * Builds the description from the body text: whitespace runs are collapsed to
 * single spaces and the result is cut to "index_descrLgd" bytes (0..255,
 * default 200) without splitting a UTF-8 character.
 *
 * @param IndexingDataAsString $indexingDataDto Data object providing the body
 * @return string The description, '' when the configured length is 0
 */
public function bodyDescription(IndexingDataAsString $indexingDataDto): string
{
    // Setting description
    $maxL = MathUtility::forceIntegerInRange($this->conf['index_descrLgd'] ?? 0, 0, 255, 200);
    if (!$maxL) {
        return '';
    }
    $body = $indexingDataDto->body;
    // In unicode mode preg_replace() returns null for input that is not valid
    // UTF-8; fall back to byte-wise matching and finally to the raw body so no
    // null reaches mb_strcut().
    $bodyDescription = preg_replace('/\s+/u', ' ', $body)
        ?? preg_replace('/\s+/', ' ', $body)
        ?? $body;
    // Shorten the string. If the database has the wrong character set,
    // the string is probably truncated again.
    return \mb_strcut($bodyDescription, 0, $maxL, 'utf-8');
}
884 
/**
 * Builds the word index of a document: header fields are analysed first
 * (title with offset 7, keywords 6, description 5), then the body.
 *
 * @param IndexingDataAsArray $indexingDataDto Word lists per field
 * @return array Index array keyed by word
 */
public function indexAnalyze(IndexingDataAsArray $indexingDataDto): array
{
    $indexArr = [];
    $headerFieldsByOffset = [
        7 => $indexingDataDto->title,
        6 => $indexingDataDto->keywords,
        5 => $indexingDataDto->description,
    ];
    foreach ($headerFieldsByOffset as $offset => $words) {
        $this->analyzeHeaderinfo($indexArr, $words, $offset);
    }
    $this->analyzeBody($indexArr, $indexingDataDto);

    return $indexArr;
}
899 
/**
 * Adds the words of one header field (title, keywords, description) to the index array.
 *
 * For every word the md5 hash is stored on first sight, bit 2^$offset is set in
 * "cmp" and "count" is increased. $this->wordcount is increased per word as well.
 *
 * @param array $retArr Index array, modified in place
 * @param array $content Words of the header field
 * @param int $offset Bit position marking the field the word was found in
 */
public function analyzeHeaderinfo(array &$retArr, array $content, int $offset): void
{
    foreach ($content as $word) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $baseword = mb_substr($word, 0, 60);
        if (!isset($retArr[$baseword])) {
            // Word ID (wid)
            $retArr[$baseword]['hash'] = md5($baseword);
        }
        // Priority used for flagBitMask feature (see extension configuration)
        $currentFlags = $retArr[$baseword]['cmp'] ?? 0;
        $retArr[$baseword]['cmp'] = $currentFlags | 2 ** $offset;
        if (empty($retArr[$baseword]['count'])) {
            $retArr[$baseword]['count'] = 0;
        }

        // Increase number of occurrences
        ++$retArr[$baseword]['count'];
        ++$this->wordcount;
    }
}
927 
/**
 * Adds the body words to the index array.
 *
 * For every word the position of its first occurrence and the md5 hash are
 * stored on first sight, and "count" is increased. $this->wordcount is
 * increased per word as well.
 *
 * @param array $retArr Index array, modified in place
 * @param IndexingDataAsArray $indexingDataDto Word lists; only the body is read
 */
public function analyzeBody(array &$retArr, IndexingDataAsArray $indexingDataDto): void
{
    foreach ($indexingDataDto->body as $key => $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        // mb_substr() matches analyzeHeaderinfo(): a byte-wise cut could split a
        // multibyte character and would produce a different word (and hash) than
        // the same word found in a header field.
        $val = mb_substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // First occurrence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = md5($val);
        }
        if (!($retArr[$val]['count'] ?? false)) {
            $retArr[$val]['count'] = 0;
        }

        // Increase number of occurrences
        $retArr[$val]['count']++;
        $this->wordcount++;
    }
}
953 
954  /********************************
955  *
956  * SQL; TYPO3 Pages
957  *
958  *******************************/
/**
 * Writes the index records of the current TYPO3 page: existing records of the
 * phash are removed, then index_phash, the section and gr_list records and
 * index_fulltext are inserted.
 */
public function submitPage(): void
{
    // Remove any current data for this phash:
    $this->removeOldIndexedPages($this->hash['phash']);
    // setting new phash_row
    // "staticPageArguments" is only stored when it is an array; a missing key is
    // treated like any other non-array value.
    $staticPageArguments = $this->conf['staticPageArguments'] ?? null;
    $fields = [
        'phash' => $this->hash['phash'],
        'phash_grouping' => $this->hash['phash_grouping'],
        'static_page_arguments' => is_array($staticPageArguments) ? json_encode($staticPageArguments) : null,
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // TYPO3 page
        'item_type' => 0,
        'item_title' => $this->indexingDataStringDto->title,
        'item_description' => $this->bodyDescription($this->indexingDataStringDto),
        'item_mtime' => (int)$this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        // Creation date of page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
    ];
    $this->connectionPool->getConnectionForTable('index_phash')->insert('index_phash', $fields);
    // PROCESSING index_section
    $this->submit_section($this->hash['phash'], $this->hash['phash']);
    // PROCESSING index_grlist
    $this->submit_grlist($this->hash['phash'], $this->hash['phash']);
    // PROCESSING index_fulltext
    $fields = [
        'phash' => $this->hash['phash'],
        'fulltextdata' => implode(' ', $this->indexingDataStringDto->toArray()),
    ];
    // The configured length may be stored as a string; cast it so the cut does
    // not fail under strict types. mb_strcut() limits by bytes like substr(),
    // but never leaves a partial UTF-8 character at the end.
    $fullTextDataLength = (int)($this->indexerConfig['fullTextDataLength'] ?? 0);
    if ($fullTextDataLength > 0) {
        $fields['fulltextdata'] = mb_strcut($fields['fulltextdata'], 0, $fullTextDataLength, 'utf-8');
    }
    $this->connectionPool->getConnectionForTable('index_fulltext')->insert('index_fulltext', $fields);
}
1013 
/**
 * Inserts one row into index_grlist for the given hash pair.
 *
 * The group list and its md5 hash are taken from $this->conf['gr_list'].
 *
 * @param string $hash Value stored in the "phash" column
 * @param string $phash_x Value stored in the "phash_x" column
 */
public function submit_grlist(string $hash, string $phash_x): void
{
    $groupList = $this->conf['gr_list'];

    $this->connectionPool
        ->getConnectionForTable('index_grlist')
        ->insert(
            'index_grlist',
            [
                'phash' => $hash,
                'phash_x' => $phash_x,
                'hash_gr_list' => md5($groupList),
                'gr_list' => $groupList,
            ]
        );
}
1032 
/**
 * Inserts one row into index_section for the given hash pair.
 *
 * The row carries the current page id from $this->conf['id'] and the
 * rl0/rl1/rl2 values added by getRootLineFields().
 *
 * @param string $hash Value stored in the "phash" column
 * @param string $hash_t3 Value stored in the "phash_t3" column
 */
public function submit_section(string $hash, string $hash_t3): void
{
    $sectionRow = [
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id'],
    ];
    // Adds the rootline columns to the row by reference.
    $this->getRootLineFields($sectionRow);

    $this->connectionPool
        ->getConnectionForTable('index_section')
        ->insert('index_section', $sectionRow);
}
1051 
/**
 * Deletes all index rows registered for a page hash.
 *
 * @param string $phash Hash whose rows are removed
 */
public function removeOldIndexedPages(string $phash): void
{
    // Removing old registrations for all tables. Because the pages are TYPO3 pages
    // there can be nothing else than 1-1 relations here.
    foreach (['index_phash', 'index_section', 'index_grlist', 'index_fulltext'] as $tableName) {
        $this->connectionPool
            ->getConnectionForTable($tableName)
            ->delete($tableName, ['phash' => $phash]);
    }

    // Removing all index_section records with phash_t3 set to this hash (this includes such
    // records set for external media on the page as well!). The re-insert of these records
    // is done in indexRegularDocument($file).
    $this->connectionPool
        ->getConnectionForTable('index_section')
        ->delete('index_section', ['phash_t3' => $phash]);
}
1071 
1072  /********************************
1073  *
1074  * SQL; External media
1075  *
1076  *******************************/
/**
 * Stores an indexed external file in index_phash and index_fulltext.
 *
 * Rows already registered for the file hash are removed first.
 *
 * @param array $hash Must provide the keys "phash" and "phash_grouping"
 * @param string $file Stored as data_filename; a URL scheme in it marks the row as external URL
 * @param array $subinfo JSON-encoded into the static_page_arguments column
 * @param string $ext Used to look up the item type in the matching external parser
 * @param int $mtime Stored as item_mtime
 * @param int $ctime Stored as item_crdate
 * @param int $size Stored as item_size
 * @param string $content_md5h Stored as contentHash
 * @param IndexingDataAsString $indexingDataDto Source of title, description and fulltext data
 */
public function submitFilePage(array $hash, string $file, array $subinfo, string $ext, int $mtime, int $ctime, int $size, string $content_md5h, IndexingDataAsString $indexingDataDto): void
{
    // Find item type; fall back to the extension itself when the parser maps nothing
    // (the null coalescing avoids a warning for a missing map entry).
    $storeItemType = ($this->external_parsers[$ext]->ext2itemtype_map[$ext] ?? null) ?: $ext;

    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($hash['phash']);

    // Split filename:
    $fileParts = parse_url($file);

    // Setting new
    $fields = [
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'static_page_arguments' => json_encode($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $storeItemType,
        'item_title' => trim($indexingDataDto->title) ?: PathUtility::basename($file),
        'item_description' => $this->bodyDescription($indexingDataDto),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        'externalUrl' => ($fileParts['scheme'] ?? false) ? 1 : 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid'],
    ];
    $this->connectionPool->getConnectionForTable('index_phash')->insert('index_phash', $fields);

    // PROCESSING index_fulltext
    $fulltext = implode(' ', $indexingDataDto->toArray());
    // The constructor casts other entries of this configuration array to int, so the
    // value is not guaranteed to be an integer. substr() rejects a string length under
    // strict_types, hence the explicit cast (a missing option means "no limit").
    $maxLength = (int)($this->indexerConfig['fullTextDataLength'] ?? 0);
    if ($maxLength > 0) {
        // Byte-wise cut, the limit is applied to the raw string length.
        $fulltext = substr($fulltext, 0, $maxLength);
    }
    $this->connectionPool->getConnectionForTable('index_fulltext')->insert(
        'index_fulltext',
        [
            'phash' => $hash['phash'],
            'fulltextdata' => $fulltext,
        ]
    );
}
1136 
/**
 * Registers a file hash in index_section for the current page unless a row
 * with the same phash and page id already exists.
 *
 * @param string $hash Hash of the file, stored as "phash"
 */
public function submitFile_section(string $hash): void
{
    // Testing if there is already a section
    $queryBuilder = $this->connectionPool->getQueryBuilderForTable('index_section');
    $matchesHash = $queryBuilder->expr()->eq(
        'phash',
        $queryBuilder->createNamedParameter($hash)
    );
    $matchesPage = $queryBuilder->expr()->eq(
        'page_id',
        $queryBuilder->createNamedParameter($this->conf['id'], Connection::PARAM_INT)
    );

    $existingSections = (int)$queryBuilder
        ->count('phash')
        ->from('index_section')
        ->where($matchesHash, $matchesPage)
        ->executeQuery()
        ->fetchOne();

    if ($existingSections !== 0) {
        return;
    }

    // The section row is linked to the hash of the page currently being indexed.
    $this->submit_section($hash, $this->hash['phash']);
}
1165 
/**
 * Deletes the rows registered for a file hash from index_phash,
 * index_grlist and index_fulltext.
 *
 * @param string $phash Hash whose rows are removed
 */
public function removeOldIndexedFiles(string $phash): void
{
    // Removing old registrations for tables.
    foreach (['index_phash', 'index_grlist', 'index_fulltext'] as $tableName) {
        $this->connectionPool
            ->getConnectionForTable($tableName)
            ->delete($tableName, ['phash' => $phash]);
    }
}
1179 
/**
 * Determines the index status of a document from its index_phash row.
 *
 * When the stored modification time equals $mtime, the row's tstamp is
 * refreshed via updateTstamp() and a notice is logged.
 *
 * @param int $mtime Modification time of the document; 0 means "not set"
 * @param string $phash Hash identifying the document in index_phash
 */
public function getIndexStatus(int $mtime, string $phash): IndexStatus
{
    $indexedRow = $this->connectionPool
        ->getConnectionForTable('index_phash')
        ->select(['item_mtime', 'tstamp'], 'index_phash', ['phash' => $phash], [], [], 1)
        ->fetchAssociative();

    if (empty($indexedRow)) {
        // Page has never been indexed (is not represented in the index_phash table).
        return IndexStatus::NEW_DOCUMENT;
    }

    // A minimum age is configured and has not passed yet since the last indexing.
    if ($this->tstamp_minAge && $GLOBALS['EXEC_TIME'] <= $indexedRow['tstamp'] + $this->tstamp_minAge) {
        return IndexStatus::MINIMUM_AGE_NOT_EXCEEDED;
    }

    // From here on: minAge is not set or has been exceeded, so mtime is considered.
    if (!$mtime) {
        // The minimum age has been exceeded, but mtime was not set.
        return IndexStatus::MODIFICATION_TIME_NOT_SET;
    }

    if ((int)$indexedRow['item_mtime'] !== $mtime) {
        // mtime differs from the index_phash mtime, it's about time to re-index.
        return IndexStatus::MODIFICATION_TIME_DIFFERS;
    }

    // mtime matched the document, so no changes detected and no content updated
    $this->updateTstamp($phash);
    $this->log_setTSlogMessage('mtime matched, timestamp updated.', LogLevel::NOTICE);
    return IndexStatus::MTIME_MATCHED;
}
1227 
/**
 * Looks for an index_phash row with the current phash_grouping and content hash.
 *
 * @return array|true The matching row (containing "phash") if one exists, otherwise true
 */
public function checkContentHash(): array|true
{
    // With this query the page will only be indexed if its content is different
    // from the same "phash_grouping" page.
    $criteria = [
        'phash_grouping' => $this->hash['phash_grouping'],
        'contentHash' => $this->content_md5h,
    ];
    $existingRow = $this->connectionPool
        ->getConnectionForTable('index_phash')
        ->select(['phash'], 'index_phash', $criteria, [], [], 1)
        ->fetchAssociative();

    if ($existingRow) {
        return $existingRow;
    }
    return true;
}
1252 
/**
 * Tells whether no index_phash row exists for the given grouping hash and content hash.
 *
 * @param string $hashGr Value compared against the "phash_grouping" column
 * @param string $content_md5h Value compared against the "contentHash" column
 * @return bool True if no matching row was found
 */
public function checkExternalDocContentHash(string $hashGr, string $content_md5h): bool
{
    $criteria = [
        'phash_grouping' => $hashGr,
        'contentHash' => $content_md5h,
    ];
    $matchingRows = $this->connectionPool
        ->getConnectionForTable('index_phash')
        ->count('*', 'index_phash', $criteria);

    return $matchingRows === 0;
}
1273 
/**
 * Tells whether index_grlist holds at least one row for the given phash_x.
 *
 * @param string $phash_x Value compared against the "phash_x" column
 */
public function is_grlist_set(string $phash_x): bool
{
    $connection = $this->connectionPool->getConnectionForTable('index_grlist');
    $matchingRows = $connection->count('phash_x', 'index_grlist', ['phash_x' => $phash_x]);

    return $matchingRows > 0;
}
1287 
/**
 * Inserts an index_grlist row for the current group list unless one with the
 * same phash and group list hash already exists; an insert is logged as notice.
 *
 * @param string $phash Value compared against / stored in the "phash" column
 * @param string $phash_x Value stored in the "phash_x" column on insert
 */
public function update_grlist(string $phash, string $phash_x): void
{
    $criteria = [
        'phash' => $phash,
        'hash_gr_list' => md5($this->conf['gr_list']),
    ];
    $existingRows = $this->connectionPool
        ->getConnectionForTable('index_grlist')
        ->count('phash', 'index_grlist', $criteria);

    if ($existingRows !== 0) {
        return;
    }

    $this->submit_grlist($phash, $phash_x);
    $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', LogLevel::NOTICE);
}
1311 
/**
 * Sets tstamp of an index_phash row to the current execution time.
 *
 * @param string $phash Hash of the row to update
 * @param int $mtime If non-zero, item_mtime is set to this value as well
 */
public function updateTstamp(string $phash, int $mtime = 0): void
{
    $changes = ['tstamp' => $GLOBALS['EXEC_TIME']];
    if ($mtime !== 0) {
        $changes['item_mtime'] = $mtime;
    }

    $connection = $this->connectionPool->getConnectionForTable('index_phash');
    $connection->update('index_phash', $changes, ['phash' => $phash]);
}
1334 
/**
 * Writes $this->conf['freeIndexSetId'] (cast to int) into the freeIndexSetId
 * column of an index_phash row.
 *
 * @param string $phash Hash of the row to update
 */
public function updateSetId(string $phash): void
{
    $setId = (int)$this->conf['freeIndexSetId'];

    $connection = $this->connectionPool->getConnectionForTable('index_phash');
    $connection->update('index_phash', ['freeIndexSetId' => $setId], ['phash' => $phash]);
}
1351 
/**
 * Writes the given parse time into the parsetime column of an index_phash row.
 *
 * @param string $phash Hash of the row to update
 * @param int $parsetime Value to store
 */
public function updateParsetime(string $phash, int $parsetime): void
{
    $connection = $this->connectionPool->getConnectionForTable('index_phash');
    $connection->update('index_phash', ['parsetime' => $parsetime], ['phash' => $phash]);
}
1368 
/**
 * Updates the rl0/rl1/rl2 columns of all index_section rows belonging to the
 * current page ($this->conf['id']) with the values from getRootLineFields().
 */
public function updateRootline(): void
{
    $rootLineValues = [];
    // Filled by reference with the rl0, rl1 and rl2 entries.
    $this->getRootLineFields($rootLineValues);

    $pageId = (int)$this->conf['id'];
    $connection = $this->connectionPool->getConnectionForTable('index_section');
    $connection->update('index_section', $rootLineValues, ['page_id' => $pageId]);
}
1386 
/**
 * Adds the keys rl0, rl1 and rl2 to the given array.
 *
 * Each value is the integer found at index 0, 1 or 2 of
 * $this->conf['rootline_uids'], or 0 if that index is not set.
 *
 * @param array $fieldArray Array to extend, passed by reference
 */
public function getRootLineFields(array &$fieldArray): void
{
    $rootlineUids = $this->conf['rootline_uids'] ?? [];
    foreach (['rl0' => 0, 'rl1' => 1, 'rl2' => 2] as $fieldName => $level) {
        $fieldArray[$fieldName] = (int)($rootlineUids[$level] ?? 0);
    }
}
1399 
1400  /********************************
1401  *
1402  * SQL; Submitting words
1403  *
1404  *******************************/
/**
 * Makes sure every word of the list has a row in index_words.
 *
 * Does nothing for an empty list or when MySQL fulltext is enabled. Otherwise
 * the number of already stored word ids is compared with the list size; if
 * they differ, the words whose id is not stored yet are inserted.
 *
 * @param array $wordListArray Word data keyed by baseword; each entry needs a "hash" key (used as wid)
 */
public function checkWordList(array $wordListArray): void
{
    if ($wordListArray === [] || IndexedSearchUtility::isMysqlFullTextEnabled()) {
        return;
    }

    $wordListArrayCount = count($wordListArray);
    $wordIds = array_column($wordListArray, 'hash');

    $queryBuilder = $this->connectionPool->getQueryBuilderForTable('index_words');
    $count = (int)$queryBuilder->count('baseword')
        ->from('index_words')
        ->where(
            $queryBuilder->expr()->in(
                'wid',
                $queryBuilder->quoteArrayBasedValueListToStringList($wordIds)
            )
        )
        ->executeQuery()
        ->fetchOne();

    if ($count === $wordListArrayCount) {
        // Every word is already registered.
        return;
    }

    $connection = $this->connectionPool->getConnectionForTable('index_words');
    $queryBuilder = $connection->createQueryBuilder();
    $result = $queryBuilder->select('wid')
        ->from('index_words')
        ->where(
            $queryBuilder->expr()->in(
                'wid',
                $queryBuilder->quoteArrayBasedValueListToStringList($wordIds)
            )
        )
        ->executeQuery();

    $this->log_setTSlogMessage('Inserting words: ' . ($wordListArrayCount - $count), LogLevel::NOTICE);

    // Collect the stored word ids as array keys so each word needs a single lookup
    // instead of one pass over the whole word list per fetched row.
    $storedWordIds = [];
    while ($row = $result->fetchAssociative()) {
        $storedWordIds[$row['wid']] = true;
    }

    foreach ($wordListArray as $baseword => $wordData) {
        if (isset($storedWordIds[$wordData['hash']])) {
            continue;
        }
        // A duplicate-key error will occur here if an already stored word is not skipped above.
        // However as long as the words are no longer than 60 chars (the baseword varchar is
        // 60 characters...) this is not a problem.
        $connection->insert(
            'index_words',
            [
                'wid' => $wordData['hash'],
                'baseword' => $baseword,
            ]
        );
    }
}
1468 
/**
 * Replaces the index_rel rows of a document with the relations for the given words.
 *
 * Words whose id is flagged as stop word in index_words are skipped.
 *
 * @param array $wordList Word data; each entry needs "hash" and "count", optionally "first" and "cmp"
 * @param string $phash Hash of the document the words belong to
 */
public function submitWords(array $wordList, string $phash): void
{
    // NOTE(review): the extracted listing lost the line holding this guard's condition
    // (only the dangling "return; }" survived). It is reconstructed from the equivalent
    // guard in checkWordList() - confirm against the upstream file.
    if (IndexedSearchUtility::isMysqlFullTextEnabled()) {
        return;
    }

    $queryBuilder = $this->connectionPool->getQueryBuilderForTable('index_words');
    $result = $queryBuilder->select('wid')
        ->from('index_words')
        ->where(
            $queryBuilder->expr()->neq('is_stopword', $queryBuilder->createNamedParameter(0, Connection::PARAM_INT))
        )
        ->groupBy('wid')
        ->executeQuery();

    // Word ids of stop words, used as lookup keys below.
    $stopWords = [];
    while ($row = $result->fetchAssociative()) {
        $stopWords[$row['wid']] = true;
    }

    $this->connectionPool->getConnectionForTable('index_rel')->delete('index_rel', ['phash' => $phash]);

    // $this->wordcount is a public counter that starts at 0; never divide by zero.
    $totalWords = max(1, $this->wordcount);

    $fields = ['phash', 'wid', 'count', 'first', 'freq', 'flags'];
    $rows = [];
    foreach ($wordList as $val) {
        if (isset($stopWords[$val['hash']])) {
            continue;
        }
        // Values are listed in the same order as $fields.
        $rows[] = [
            $phash,
            $val['hash'],
            (int)$val['count'],
            (int)($val['first'] ?? 0),
            $this->freqMap($val['count'] / $totalWords),
            ($val['cmp'] ?? 0) & $this->flagBitMask,
        ];
    }

    if ($rows !== []) {
        $this->connectionPool->getConnectionForTable('index_rel')->bulkInsert('index_rel', $rows, $fields);
    }
}
1513 
/**
 * Maps a frequency value to an integer.
 *
 * With the factor freqMax * 100 * freqRange: values up to 1 are multiplied by
 * the factor and capped at freqRange, values above 1 are divided by it. The
 * result is truncated to int.
 *
 * @param float $freq Frequency to map
 */
public function freqMap(float $freq): int
{
    $mapFactor = $this->freqMax * 100 * $this->freqRange;

    if ($freq > 1) {
        return (int)($freq / $mapFactor);
    }

    return (int)min($freq * $mapFactor, $this->freqRange);
}
1532 
1533  /********************************
1534  *
1535  * Hashing
1536  *
1537  *******************************/
/**
 * Computes $this->hash['phash_grouping'] and $this->hash['phash'] for the current page.
 *
 * Both are md5 sums of a serialized array, so the order and types of the
 * array entries below are part of the hash and must not change.
 */
public function setT3Hashes(): void
{
    $staticPageArguments = $this->conf['staticPageArguments'];

    // Grouping hash: identifies a "page" combined of id, type, language, mount point
    // and static page arguments.
    $groupingData = [
        'id' => (int)$this->conf['id'],
        'type' => (int)$this->conf['type'],
        'sys_lang' => (int)$this->conf['sys_language_uid'],
        'MP' => (string)$this->conf['MP'],
        'staticPageArguments' => is_array($staticPageArguments) ? json_encode($staticPageArguments) : null,
    ];
    $this->hash['phash_grouping'] = md5(serialize($groupingData));

    // Plain phash: the same data with gr_list appended (subdivision where special page
    // composition based on login is taken into account as well. It is expected that such
    // pages are normally similar regardless of the login.)
    $pageData = $groupingData + ['gr_list' => (string)$this->conf['gr_list']];
    $this->hash['phash'] = md5(serialize($pageData));
}
1557 
/**
 * Computes the hashes for an external file.
 *
 * @param string $file File reference the hashes are based on
 * @param array $subinfo Additional data that only goes into "phash"
 * @return array Keys "phash_grouping" (md5 over the file only) and "phash" (md5 over file and subinfo)
 */
public function setExtHashes(string $file, array $subinfo = []): array
{
    // The serialized arrays are hashed, so entry order matters: "file" first, then "subinfo".
    $groupingData = ['file' => $file];
    $fileData = $groupingData + ['subinfo' => $subinfo];

    return [
        'phash_grouping' => md5(serialize($groupingData)),
        'phash' => md5(serialize($fileData)),
    ];
}
1579 
/**
 * Forwards a message to the injected TimeTracker.
 *
 * @param string $msg Message to log
 * @param string $logLevel Log level passed along, LogLevel::INFO by default
 */
public function log_setTSlogMessage(string $msg, string $logLevel = LogLevel::INFO): void
{
    $tracker = $this->timeTracker;
    $tracker->setTSlogMessage($msg, $logLevel);
}
1584 
/**
 * Re-joins a comma-separated keyword list with ", " and wraps the result in
 * one leading and one trailing space.
 *
 * @param string $keywordList Comma-separated keywords
 */
protected function addSpacesToKeywordList(string $keywordList): string
{
    return ' ' . implode(', ', GeneralUtility::trimExplode(',', $keywordList)) . ' ';
}
1598 
/**
 * Removes every word whose hash was already seen earlier in the list, so each
 * hash is kept only for its first word.
 *
 * @param array $wordList Word data keyed by baseword; each entry needs a "hash" key
 * @return array The list without the later entries of duplicate hashes
 */
private function removePhashCollisions(array $wordList): array
{
    // Hashes are tracked as array keys, which keeps the check constant-time
    // instead of scanning a growing list for every word.
    $seenHashes = [];
    foreach ($wordList as $baseword => $wordData) {
        $wordHash = $wordData['hash'];
        if (isset($seenHashes[$wordHash])) {
            unset($wordList[$baseword]);
            continue;
        }
        $seenHashes[$wordHash] = true;
    }
    return $wordList;
}
1618 
/**
 * Returns the current Unix time in milliseconds, rounded to an integer.
 */
protected function milliseconds(): int
{
    $nowInSeconds = microtime(true);

    return (int)round($nowInSeconds * 1000);
}
1626 }
‪TYPO3\CMS\IndexedSearch
‪TYPO3\CMS\Core\Utility\PathUtility
Definition: PathUtility.php:27
‪TYPO3\CMS\Core\Database\Connection\PARAM_INT
‪const PARAM_INT
Definition: Connection.php:52
‪TYPO3\CMS\IndexedSearch\Dto\IndexingDataAsString\fromArray
‪static fromArray(array $input)
Definition: IndexingDataAsString.php:35
‪TYPO3\CMS\Core\Utility\PathUtility\isAbsolutePath
‪static isAbsolutePath(string $path)
Definition: PathUtility.php:287
‪TYPO3\CMS\IndexedSearch\Dto\IndexingDataAsString
Definition: IndexingDataAsString.php:24
‪TYPO3\CMS\Core\Html\HtmlParser
Definition: HtmlParser.php:26
‪TYPO3\CMS\Core\Configuration\ExtensionConfiguration
Definition: ExtensionConfiguration.php:47
‪TYPO3\CMS\Core\Core\Environment\getPublicPath
‪static getPublicPath()
Definition: Environment.php:187
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility\isMysqlFullTextEnabled
‪static isMysqlFullTextEnabled()
Definition: IndexedSearchUtility.php:148
‪TYPO3\CMS\IndexedSearch\Type\IndexStatus
‪IndexStatus
Definition: IndexStatus.php:24
‪$fields
‪$fields
Definition: pages.php:5
‪TYPO3\CMS\Core\Utility\PathUtility\basename
‪static basename(string $path)
Definition: PathUtility.php:220
‪TYPO3\CMS\Core\Utility\GeneralUtility\writeFile
‪static bool writeFile(string $file, string $content, bool $changePermissions=false)
Definition: GeneralUtility.php:1467
‪TYPO3\CMS\Core\Utility\GeneralUtility\getUrl
‪static string false getUrl(string $url)
Definition: GeneralUtility.php:1442
‪TYPO3\CMS\IndexedSearch\Dto\IndexingDataAsArray\fromArray
‪static fromArray(array $input)
Definition: IndexingDataAsArray.php:37
‪TYPO3\CMS\Core\Http\RequestFactory
Definition: RequestFactory.php:30
‪TYPO3\CMS\Core\Database\Connection
Definition: Connection.php:41
‪TYPO3\CMS\Webhooks\Message\$url
‪identifier readonly UriInterface $url
Definition: LoginErrorOccurredMessage.php:36
‪$GLOBALS
‪$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['adminpanel']['modules']
Definition: ext_localconf.php:25
‪TYPO3\CMS\Core\Core\Environment
Definition: Environment.php:41
‪TYPO3\CMS\Core\Utility\MathUtility
Definition: MathUtility.php:24
‪TYPO3\CMS\Core\Database\ConnectionPool
Definition: ConnectionPool.php:46
‪TYPO3\CMS\Core\Utility\MathUtility\forceIntegerInRange
‪static int forceIntegerInRange(mixed $theInt, int $min, int $max=2000000000, int $defaultValue=0)
Definition: MathUtility.php:34
‪TYPO3\CMS\IndexedSearch\Dto\IndexingDataAsArray
Definition: IndexingDataAsArray.php:26
‪TYPO3\CMS\Core\Utility\GeneralUtility
Definition: GeneralUtility.php:52
‪TYPO3\CMS\Core\TimeTracker\TimeTracker
Definition: TimeTracker.php:34
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility
Definition: IndexedSearchUtility.php:29
‪TYPO3\CMS\Core\Utility\GeneralUtility\trimExplode
‪static list< string > trimExplode(string $delim, string $string, bool $removeEmptyValues=false, int $limit=0)
Definition: GeneralUtility.php:822