‪TYPO3CMS  ‪main
Indexer.php
Go to the documentation of this file.
1 <?php
2 
3 declare(strict_types=1);
4 
5 /*
6  * This file is part of the TYPO3 CMS project.
7  *
8  * It is free software; you can redistribute it and/or modify it under
9  * the terms of the GNU General Public License, either version 2
10  * of the License, or any later version.
11  *
12  * For the full copyright and license information, please read the
13  * LICENSE.txt file that was distributed with this source code.
14  *
15  * The TYPO3 project - inspiring people to share!
16  */
17 
19 
20 use Psr\Log\LogLevel;
35 
41 class Indexer
42 {
// Comma-separated tag names; splitHTMLContent() removes every such section from the body before indexing.
46  public string $excludeSections = 'script,style';
47 
// External document parser objects keyed by file extension, filled by initializeExternalParsers().
51  public array $external_parsers = [];
52 
// Set in the constructor from the "minAge" extension setting multiplied by 3600 (never negative).
// NOTE(review): presumably the setting is in hours and this value in seconds - confirm.
57  public int $tstamp_minAge = 0;
58 
// Upper limit for $externalFileCounter; read from the "maxExternalFiles" setting and clamped to 0..1000.
62  public int $maxExternalFiles = 0;
63 
// When true, indexTypo3PageContent() indexes the page even if the index status says it is not required.
67  public bool $forceIndexing = false;
68 
// Empty field set used to seed a fresh IndexingDataAsString DTO.
69  public array $defaultIndexingDataPayload = [
70  'title' => '',
71  'description' => '',
72  'keywords' => '',
73  'body' => '',
74  ];
75 
// Incremented once per word handled by analyzeHeaderinfo() and analyzeBody().
76  public int $wordcount = 0;
// Number of external documents indexed so far; compared against $maxExternalFiles.
77  public int $externalFileCounter = 0;
// Indexing configuration of the current page, replaced by init() when an array is passed.
78  public array $conf = [];
79 
// "indexed_search" extension configuration as returned by ExtensionConfiguration::get().
83  public array $indexerConfig = [];
84 
// Holds at least the keys "phash" and "phash_grouping" of the current page.
// NOTE(review): presumably populated by setT3Hashes() - confirm.
88  public array $hash = [];
89 
// Hash set of the external file part currently processed (result of setExtHashes()).
93  public array $file_phash_arr = [];
94 
// Title/keywords/description/body of the current page as produced by splitHTMLContent().
95  public IndexingDataAsString $indexingDataStringDto;
96 
// md5 over the concatenated fields of $indexingDataStringDto.
100  public string $content_md5h = '';
// Reset per file part in indexRegularDocument() and written to index_debug by submitPage().
// NOTE(review): presumably filled by log_setTSlogMessage() - confirm.
101  public array $internal_log = [];
// Raw content of the external URL fetched last by indexExternalUrl().
102  public string $indexExternalUrl_content = '';
// NOTE(review): $freqRange and $freqMax are not used in the visible part of this class -
// presumably word frequency scaling parameters; confirm against submitWords().
103  public int $freqRange = 32000;
104  public float $freqMax = 0.1;
// Read from the "flagBitMask" extension setting and clamped to 0..255 in the constructor.
105  public int $flagBitMask;
106 
/**
 * Receives the collaborating services and derives the numeric indexer
 * limits from the "indexed_search" extension configuration.
 */
public function __construct(
    private readonly TimeTracker $timeTracker,
    private readonly Lexer $lexer,
    private readonly RequestFactory $requestFactory,
    private readonly ConnectionPool $connectionPool,
    ExtensionConfiguration $extensionConfiguration,
) {
    // Indexer configuration from Extension Manager interface
    $this->indexerConfig = $extensionConfiguration->get('indexed_search');
    $settings = $this->indexerConfig;

    $minAge = (int)($settings['minAge'] ?? 0);
    $maxFiles = (int)($settings['maxExternalFiles'] ?? 5);
    $bitMask = (int)($settings['flagBitMask'] ?? 0);

    $this->tstamp_minAge = MathUtility::forceIntegerInRange($minAge * 3600, 0);
    $this->maxExternalFiles = MathUtility::forceIntegerInRange($maxFiles, 0, 1000);
    $this->flagBitMask = MathUtility::forceIntegerInRange($bitMask, 0, 255);
}
120 
/**
 * Prepares the indexer for one page.
 *
 * @param array|null $configuration Indexing configuration for the page; when
 *                                  null the current $this->conf is kept
 */
public function init(?array $configuration = null): void
{
    if ($configuration !== null) {
        $this->conf = $configuration;
    }
    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();
    // Initialize external document parsers only when the page configuration asks
    // for external files; a missing key counts as "off".
    // Example configuration, see ext_localconf.php of this file!
    if ($this->conf['index_externals'] ?? false) {
        $this->initializeExternalParsers();
    }
}
137 
/**
 * Instantiates every parser class registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers']
 * and keeps those whose initParser() call succeeds.
 */
public function initializeExternalParsers(): void
{
    $registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] ?? [];
    foreach ($registeredParsers as $extension => $className) {
        $parser = GeneralUtility::makeInstance($className);
        // Register first: the entry is visible while the parser initializes itself.
        $this->external_parsers[$extension] = $parser;
        $parser->pObj = $this;
        if ($parser->initParser($extension)) {
            continue;
        }
        // The parser declined this extension, so drop its entry again.
        unset($this->external_parsers[$extension]);
    }
}
149 
150  /********************************
151  *
152  * Indexing; TYPO3 pages (HTML content)
153  *
154  *******************************/
/**
 * Indexes the HTML content of the current TYPO3 page ($this->conf['content']).
 *
 * Indexing runs when the index status requires it, when no gr_list entry
 * exists for the page hash, or when $forceIndexing is set. If an entry with
 * the same content hash already exists and no reindexing is required, only
 * timestamp, set id, gr_list and rootline are updated.
 */
158  public function indexTypo3PageContent(): void
159  {
160  $indexStatus = $this->getIndexStatus($this->conf['mtime'], $this->hash['phash']);
161  $reindexingRequired = $indexStatus->reindexRequired();
162  $is_grlist = $this->is_grlist_set($this->hash['phash']);
163  if ($reindexingRequired || !$is_grlist || $this->forceIndexing) {
164  // Setting message:
165  if ($this->forceIndexing) {
166  $this->log_setTSlogMessage('Indexing needed, reason: Forced', LogLevel::NOTICE);
167  } elseif ($reindexingRequired) {
168  $this->log_setTSlogMessage('Indexing needed, reason: ' . $indexStatus->reason(), LogLevel::NOTICE);
169  } else {
170  $this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!', LogLevel::NOTICE);
171  }
172  // Divide into title,keywords,description and body:
173  $this->timeTracker->push('Split content');
174  $this->indexingDataStringDto = $this->splitHTMLContent($this->conf['content']);
// A configured document title takes precedence over the <title> tag.
175  if ($this->conf['indexedDocTitle']) {
176  $this->indexingDataStringDto->title = $this->conf['indexedDocTitle'];
177  }
178  $this->timeTracker->pull();
179  // Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so don't!)
180  $this->content_md5h = md5(implode('', $this->indexingDataStringDto->toArray()));
181  // This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
182  // If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more.
183  // This will also prevent pages from being indexed if a fe_users has logged in, and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem.
184  $checkCHash = $this->checkContentHash();
185  if (!is_array($checkCHash) || $reindexingRequired) {
// Start of the parse time measurement stored via updateParsetime() below.
186  $Pstart = $this->milliseconds();
187  $this->timeTracker->push('Converting entities of content');
188  $this->charsetEntity2utf8($this->indexingDataStringDto);
189  $this->timeTracker->pull();
190 
191  // Splitting words
192  $this->timeTracker->push('Extract words from content');
193  $splitInWords = $this->processWordsInArrays($this->indexingDataStringDto);
194  $this->timeTracker->pull();
195 
196  // Analyze the indexed words.
197  $this->timeTracker->push('Analyze the extracted words');
198  $indexArr = $this->indexAnalyze($splitInWords);
199  $this->timeTracker->pull();
200 
201  // Submitting page (phash) record
202  $this->timeTracker->push('Submitting page');
203  $this->submitPage();
204  $this->timeTracker->pull();
205 
206  // Check words and submit to word list if not there
207  $this->timeTracker->push('Check word list and submit words');
// NOTE(review): the statement opening the block closed by the brace after submitWords()
// is not visible in this view (the embedded line numbering skips a line here) - confirm against the original file.
209  $indexArr = $this->removePhashCollisions($indexArr);
210  $this->checkWordList($indexArr);
211  $this->submitWords($indexArr, $this->hash['phash']);
212  }
213  $this->timeTracker->pull();
214 
215  // Set parse time
216  $this->updateParsetime($this->hash['phash'], $this->milliseconds() - $Pstart);
217 
218  // Checking external files if configured for.
219  if ($this->conf['index_externals']) {
220  $this->timeTracker->push('Checking external files', '');
221  $this->extractLinks($this->conf['content']);
222  $this->timeTracker->pull();
223  }
224  } else {
225  // Update the timestamp
226  $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
227  $this->updateSetId($this->hash['phash']);
228 
229  // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
230  $this->update_grlist($checkCHash['phash'], $this->hash['phash']);
231  $this->updateRootline();
232  $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
233  }
234  } else {
235  $this->log_setTSlogMessage('Indexing not needed, reason: ' . $indexStatus->reason());
236  }
237  }
238 
/**
 * Splits an HTML document into title, keywords, description and body text.
 *
 * The title is taken from the <title> tag (only the part after the first
 * colon, if there is one). Keywords and description come from meta tags when
 * $this->conf['index_metatags'] is set. The body is reduced to the
 * TYPO3SEARCH marked parts, stripped of the sections named in
 * $excludeSections and of all remaining tags.
 *
 * @param string $content Complete HTML document
 */
public function splitHTMLContent(string $content): IndexingDataAsString
{
    $dummy = null;
    $dummy2 = null;
    $indexingDataDto = IndexingDataAsString::fromArray($this->defaultIndexingDataPayload);
    $indexingDataDto->body = stristr($content, '<body') ?: '';
    // substr() with a length of -0 returns an empty string, which would drop
    // title and meta tags of a document that has no <body> tag at all.
    $headPart = $indexingDataDto->body === ''
        ? $content
        : substr($content, 0, -strlen($indexingDataDto->body));
    // get title
    $this->embracingTags($headPart, 'TITLE', $indexingDataDto->title, $dummy2, $dummy);
    $titleParts = explode(':', $indexingDataDto->title, 2);
    $indexingDataDto->title = trim($titleParts[1] ?? $titleParts[0]);
    // get keywords and description meta tags
    if ($this->conf['index_metatags'] ?? false) {
        $meta = [];
        $i = 0;
        // Each call consumes one <meta> tag from $headPart and stores its attribute string.
        while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $meta[$i])) {
            $i++;
        }
        // @todo The code below stops at first unset tag. Is that correct?
        for ($i = 0; isset($meta[$i]); $i++) {
            // decode HTML entities, meta tag content needs to be encoded later
            $meta[$i] = GeneralUtility::get_tag_attributes($meta[$i], true);
            $metaName = $meta[$i]['name'] ?? '';
            // A meta tag may lack the content attribute; treat that as empty.
            $metaContent = $meta[$i]['content'] ?? '';
            if (stripos($metaName, 'keywords') !== false) {
                $indexingDataDto->keywords .= ',' . $this->addSpacesToKeywordList($metaContent);
            }
            if (stripos($metaName, 'description') !== false) {
                $indexingDataDto->description .= ',' . $metaContent;
            }
        }
    }
    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($indexingDataDto->body);
    // Get rid of unwanted sections (i.e. scripting and style stuff) in body
    foreach (explode(',', $this->excludeSections) as $tag) {
        while ($this->embracingTags($indexingDataDto->body, $tag, $dummy, $indexingDataDto->body, $dummy2)) {
        }
    }
    // remove tags, but first make sure we don't concatenate words by doing it
    $indexingDataDto->body = str_replace('<', ' <', $indexingDataDto->body);
    $indexingDataDto->body = trim(strip_tags($indexingDataDto->body));
    $indexingDataDto->keywords = trim($indexingDataDto->keywords);
    $indexingDataDto->description = trim($indexingDataDto->description);

    return $indexingDataDto;
}
288 
/**
 * Reads the charset announced by a <meta http-equiv="Content-Type"> tag.
 *
 * @param string $content HTML document
 * @return string Charset as written in the tag, or an empty string if none was found
 */
public function getHTMLcharset(string $content): string
{
    // @todo: Use \DOMDocument and DOMXpath
    $metaTagPattern = '/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i';
    if (!preg_match($metaTagPattern, $content, $metaTagMatch)) {
        return '';
    }

    // Only look for the charset inside the tag that was just matched.
    $charsetPattern = '/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i';
    if (!preg_match($charsetPattern, $metaTagMatch[0], $charsetMatch)) {
        return '';
    }

    return $charsetMatch[1];
}
303 
/**
 * Converts an HTML document to UTF-8 and decodes its HTML entities.
 *
 * @param string $content HTML document
 * @param string $charset Source charset; detected via getHTMLcharset() when empty
 * @return string Entity-decoded content; the charset conversion is skipped
 *                when the charset is empty, already UTF-8 or unknown to mbstring
 */
public function convertHTMLToUtf8(string $content, string $charset = ''): string
{
    // Find charset
    $charset = $charset ?: $this->getHTMLcharset($content);
    $charset = strtolower(trim($charset));
    // Convert charset
    if ($charset && $charset !== 'utf-8') {
        try {
            $converted = mb_convert_encoding($content, 'utf-8', $charset);
            if (is_string($converted)) {
                $content = $converted;
            }
        } catch (\ValueError) {
            // The charset name originates from the document itself and may be
            // unknown to mbstring; index the content unconverted instead of aborting.
        }
    }
    // Convert entities, assuming document is now UTF-8
    return html_entity_decode($content);
}
319 
/**
 * Finds the first occurrence of a tag (case-insensitive) and splits the
 * string around it.
 *
 * @param string $string Text to search
 * @param string $tagName Tag name without angle brackets
 * @param string|null $tagContent Receives the text between start and end tag ('' if there is no end tag)
 * @param string|null $stringAfter Receives the input with the whole element removed; without an end tag
 *                                 only the text following the start tag
 * @param string|null $paramList Receives the raw attribute string of the start tag
 * @return bool False when the start tag does not occur; the reference parameters are left untouched then
 */
public function embracingTags(string $string, string $tagName, ?string &$tagContent, ?string &$stringAfter, ?string &$paramList): bool
{
    $endTag = '</' . $tagName . '>';
    $startTag = '<' . $tagName;
    // stristr used because we want a case-insensitive search for the tag.
    $fromStartTag = stristr($string, $startTag);
    // if the tag was not found, return FALSE
    if ($fromStartTag === false) {
        return false;
    }
    // A start tag cut off before its closing ">" yields a single element only;
    // pad it so the remainder is an empty string rather than an undefined offset.
    [$paramList, $afterStartTag] = array_pad(
        explode('>', substr($fromStartTag, strlen($startTag)), 2),
        2,
        ''
    );
    $fromEndTag = stristr($afterStartTag, $endTag);
    if ($fromEndTag !== false) {
        $stringBefore = substr($string, 0, (int)stripos($string, $startTag));
        $tagContent = substr($afterStartTag, 0, strlen($afterStartTag) - strlen($fromEndTag));
        $stringAfter = $stringBefore . substr($fromEndTag, strlen($endTag));
    } else {
        $tagContent = '';
        $stringAfter = $afterStartTag;
    }
    return true;
}
353 
/**
 * Reduces the body to the parts selected by <!--TYPO3SEARCH_begin--> and
 * <!--TYPO3SEARCH_end--> markers.
 *
 * Text following a "begin" marker is kept; text preceding an "end" marker is
 * kept when it was not already started by a "begin" marker.
 *
 * @param string $body HTML body, modified in place when markers are present
 * @return bool True if at least one marker was found
 */
public function typoSearchTags(string &$body): bool
{
    $expBody = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body) ?: [];
    if (count($expBody) <= 1) {
        return false;
    }
    $body = '';
    $prev = '';
    foreach ($expBody as $val) {
        $part = explode('-->', $val, 2);
        $marker = trim($part[0]);
        if ($marker === 'begin') {
            // A marker cut off before its closing "-->" has no trailing text.
            $body .= $part[1] ?? '';
            $prev = '';
        } elseif ($marker === 'end') {
            $body .= $prev;
        } else {
            $prev = $val;
        }
    }
    return true;
}
382 
/**
 * Finds the hyperlinks in the given HTML and indexes the documents they point to.
 *
 * Links with a URL scheme are passed to indexExternalUrl() when the
 * "indexExternalURLs" extension setting is enabled; links without a query
 * string that resolve to a local file are passed to indexRegularDocument().
 *
 * @param string $content HTML content to scan for <a> tags
 */
public function extractLinks(string $content): void
{
    foreach ($this->extractHyperLinks($content) as $linkInfo) {
        // Decode entities:
        // localPath means: This file is sent by a download script. While the indexed URL has to point to
        // $linkInfo['href'], the absolute path to the file is specified here!
        $linkSource = htmlspecialchars_decode($linkInfo['localPath'] ?: $linkInfo['href']);
        // Parse URL:
        $qParts = parse_url($linkSource);
        // Check for jumpurl (TYPO3 specific thing...)
        if (($qParts['query'] ?? false) && str_contains($qParts['query'], 'jumpurl=')) {
            parse_str($qParts['query'], $getP);
            // "jumpurl=" may be part of another parameter name or an array parameter;
            // only follow it when it is a plain string value.
            $jumpUrl = $getP['jumpurl'] ?? null;
            if (is_string($jumpUrl)) {
                $linkSource = $jumpUrl;
                $qParts = parse_url($linkSource);
            }
        }
        if (!$linkInfo['localPath'] && ($qParts['scheme'] ?? false)) {
            if ($this->indexerConfig['indexExternalURLs'] ?? false) {
                // Index external URL (http or otherwise)
                $this->indexExternalUrl($linkSource);
            }
            continue;
        }
        if ($qParts['query'] ?? false) {
            continue;
        }
        $linkSource = urldecode($linkSource);
        if (GeneralUtility::isAllowedAbsPath($linkSource)) {
            $localFile = $linkSource;
        } else {
            $localFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $linkSource);
        }
        if (!$localFile || !@is_file($localFile)) {
            continue;
        }
        // Index local file:
        $documentName = $linkInfo['localPath'] ? $linkInfo['href'] : $linkSource;
        $ext = strtolower(pathinfo($linkSource, PATHINFO_EXTENSION));
        if ($ext === '' && pathinfo($documentName, PATHINFO_EXTENSION) === '') {
            // Without any file extension no parser can be selected, so there is nothing to index.
            continue;
        }
        if ($linkInfo['localPath']) {
            $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
        } else {
            $this->indexRegularDocument($linkSource);
        }
    }
}
432 
/**
 * Collects all <a> tags that carry a non-empty href which is not a pure
 * fragment link.
 *
 * @param string $html HTML to scan
 * @return array List of arrays with the keys "tag" (raw tag), "href" and
 *               "localPath" (result of createLocalPath() for the URL-decoded href)
 */
public function extractHyperLinks(string $html): array
{
    $parser = GeneralUtility::makeInstance(HtmlParser::class);
    $links = [];
    foreach ($parser->splitTags('a', $html) as $position => $chunk) {
        // Only the odd positions of the split result are inspected as tags.
        if ($position % 2 === 0) {
            continue;
        }
        $attributes = $parser->get_tag_attributes($chunk, true);
        $tagName = $parser->getFirstTagName($chunk);
        if (strtolower($tagName) !== 'a') {
            continue;
        }
        if (empty($attributes[0]['href']) || str_starts_with($attributes[0]['href'], '#')) {
            continue;
        }
        $href = $attributes[0]['href'];
        $links[] = [
            'tag' => $chunk,
            'href' => $href,
            'localPath' => $this->createLocalPath(urldecode($href)),
        ];
    }
    return $links;
}
460 
/**
 * Returns the href of the first <base> tag that has a non-empty one.
 *
 * @param string $html HTML to scan
 * @return string The base href, or an empty string when no <base> tag carries an href
 */
public function extractBaseHref(string $html): string
{
    $href = '';
    $htmlParser = GeneralUtility::makeInstance(HtmlParser::class);
    $htmlParts = $htmlParser->splitTags('base', $html);
    foreach ($htmlParts as $index => $tagData) {
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'base') {
            continue;
        }
        // <base target="..."> is valid without href; fall back to an empty
        // string so the declared string return type always holds.
        $href = (string)($tagAttributes[0]['href'] ?? '');
        if ($href) {
            break;
        }
    }
    return $href;
}
483 
484  /******************************************
485  *
486  * Indexing; external URL
487  *
488  ******************************************/
/**
 * Indexes an external URL if a HEAD request reports it as HTML.
 *
 * The content is fetched, written to a temporary file and passed to
 * indexRegularDocument() with forced indexing. The fetched content is also
 * stored in $indexExternalUrl_content.
 *
 * @param string $externalUrl URL to index
 */
public function indexExternalUrl(string $externalUrl): void
{
    // Get headers:
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    if (!is_array($urlHeaders)) {
        return;
    }
    // HTTP header names are case-insensitive and the header may be absent altogether.
    $contentType = array_change_key_case($urlHeaders, CASE_LOWER)['content-type'] ?? '';
    if (stripos($contentType, 'text/html') === false) {
        return;
    }
    // getUrl() reports failure with false, which must not reach the string typed property.
    $content = GeneralUtility::getUrl($externalUrl);
    $content = is_string($content) ? $content : '';
    $this->indexExternalUrl_content = $content;
    if ($content === '') {
        return;
    }
    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    try {
        GeneralUtility::writeFile($tmpFile, $content);
        // Index that file:
        // Using "TRUE" for second parameter to force indexing of external URLs (mtime doesn't make sense, does it?)
        $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
    } finally {
        // Remove the temporary file even when indexing throws.
        if (is_file($tmpFile)) {
            unlink($tmpFile);
        }
    }
}
511 
/**
 * Sends a HEAD request and returns the response headers.
 *
 * @param string $url URL to request
 * @return array|false Header name => all values of that header concatenated,
 *                     or false when the request threw an exception
 */
public function getUrlHeaders(string $url): array|false
{
    try {
        $headers = $this->requestFactory->request($url, 'HEAD')->getHeaders();
    } catch (\Exception $e) {
        // fail silently if the HTTP request failed
        return false;
    }

    // array_map() keeps the header names as keys.
    return array_map(
        static fn ($values): string => implode('', $values),
        $headers
    );
}
533 
/**
 * Resolves a link target to a local file path by trying the available
 * strategies in order: absRefPrefix, site URL, absolute URL, relative URL.
 *
 * @param string $sourcePath Link target
 * @return string Result of the first strategy that returns a non-empty path,
 *                otherwise the result of the relative URL strategy (may be '')
 */
protected function createLocalPath(string $sourcePath): string
{
    $strategies = [
        $this->createLocalPathUsingAbsRefPrefix(...),
        $this->createLocalPathUsingDomainURL(...),
        $this->createLocalPathFromAbsoluteURL(...),
        $this->createLocalPathFromRelativeURL(...),
    ];

    $resolved = '';
    foreach ($strategies as $strategy) {
        $resolved = $strategy($sourcePath);
        if ($resolved !== '') {
            break;
        }
    }

    return $resolved;
}
555 
/**
 * Maps a URL that starts with the site URL of the current request to a file
 * below the public path.
 *
 * @param string $sourcePath Link target
 * @return string Local path, or '' when the URL does not start with the site
 *                URL or the file is not allowed
 */
protected function createLocalPathUsingDomainURL(string $sourcePath): string
{
    $siteUrl = $GLOBALS['TYPO3_REQUEST']->getAttribute('normalizedParams')->getSiteUrl();
    if (!str_starts_with($sourcePath, $siteUrl)) {
        return '';
    }

    $candidate = Environment::getPublicPath() . '/' . substr($sourcePath, strlen($siteUrl));

    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
573 
/**
 * Maps a URL that starts with the configured absRefPrefix of the frontend
 * TypoScript to a file below the public path.
 *
 * @param string $sourcePath Link target
 * @return string Local path, or '' when no TypoScript config is available, the
 *                prefix is empty or does not match, or the file is not allowed
 */
protected function createLocalPathUsingAbsRefPrefix(string $sourcePath): string
{
    $typoScriptConfig = $GLOBALS['TYPO3_REQUEST']->getAttribute('frontend.typoscript')?->getConfigArray();
    if (!$typoScriptConfig) {
        return '';
    }

    $prefix = $typoScriptConfig['absRefPrefix'] ?? '';
    $prefixLength = strlen($prefix);
    if ($prefixLength === 0 || !str_starts_with($sourcePath, $prefix)) {
        return '';
    }

    $candidate = Environment::getPublicPath() . '/' . substr($sourcePath, $prefixLength);

    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
596 
/**
 * Maps a URL with a leading slash to a file below the public path.
 *
 * @param string $sourcePath Link target
 * @return string Local path, or '' when the URL has no leading slash or the
 *                file is not allowed
 */
protected function createLocalPathFromAbsoluteURL(string $sourcePath): string
{
    if (!str_starts_with($sourcePath, '/')) {
        return '';
    }

    $candidate = Environment::getPublicPath() . '/' . substr($sourcePath, 1);

    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
612 
/**
 * Maps a relative URL to a file below the public path.
 *
 * @param string $sourcePath Link target
 * @return string Local path, or '' when the URL is not relative or the file
 *                is not allowed
 */
protected function createLocalPathFromRelativeURL(string $sourcePath): string
{
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }

    $candidate = Environment::getPublicPath() . '/' . $sourcePath;

    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
627 
/**
 * Tells whether a URL has neither a scheme nor a path with a leading slash.
 *
 * @param string $url URL to inspect
 * @return bool True for relative URLs; also true when parse_url() cannot parse the URL
 */
protected static function isRelativeURL(string $url): bool
{
    $parts = @parse_url($url);
    $hasScheme = isset($parts['scheme']) && $parts['scheme'] !== '';
    if ($hasScheme) {
        return false;
    }

    $firstPathCharacter = $parts['path'][0] ?? '';

    return !str_starts_with($firstPathCharacter, '/');
}
636 
/**
 * Checks that a path, after resolving "../" segments, points to an existing
 * file inside the public path.
 *
 * @param string $filePath Path to check
 * @return bool True only for existing files below the public path
 */
protected static function isAllowedLocalFile(string $filePath): bool
{
    $filePath = GeneralUtility::resolveBackPath($filePath);
    // Compare against the directory including its trailing slash; a bare prefix
    // match would also accept sibling directories such as "<public>_backup/".
    $publicDirectory = rtrim(Environment::getPublicPath(), '/') . '/';

    return str_starts_with($filePath, $publicDirectory) && is_file($filePath);
}
647 
648  /******************************************
649  *
650  * Indexing; external files (PDF, DOC, etc)
651  *
652  ******************************************/
/**
 * Indexes a regular document (local file or downloaded content) with the
 * external parser registered for its extension.
 *
 * The parser may split the file into several parts (fileContentParts());
 * each part gets its own hash set and is indexed separately. A section
 * record is submitted for every part, whether it was indexed or not.
 *
 * @param string $file Path or URL identifying the document; relative paths are resolved against the public path
 * @param bool $force Index even when the index status does not require it; also bypasses the $maxExternalFiles limit
 * @param string $contentTmpFile If set, the content is read from this file instead of $file
 * @param string $altExtension If set, used instead of the extension found in $file
 */
661  public function indexRegularDocument(string $file, bool $force = false, string $contentTmpFile = '', string $altExtension = ''): void
662  {
663  $fI = pathinfo($file);
// NOTE(review): $fI['extension'] is only set when $file has an extension - confirm callers always pass one or $altExtension.
664  $ext = $altExtension ?: strtolower($fI['extension']);
665  // Create abs-path
666  if (!$contentTmpFile) {
667  if (!‪PathUtility::isAbsolutePath($file)) {
668  // Relative, prepend public web path:
669  $absFile = GeneralUtility::getFileAbsFileName(‪Environment::getPublicPath() . '/' . $file);
670  } else {
671  // Absolute, pass-through:
672  $absFile = $file;
673  }
674  $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
675  } else {
676  $absFile = $contentTmpFile;
677  }
678  // Indexing the document:
679  if ($absFile && @is_file($absFile)) {
680  if ($this->external_parsers[$ext] ?? false) {
681  $fileInfo = stat($absFile);
682  $cParts = $this->fileContentParts($ext, $absFile);
683  foreach ($cParts as $cPKey) {
684  $this->internal_log = [];
685  $this->timeTracker->push('Index: ' . str_replace('.', '_', ‪PathUtility::basename($file)) . ($cPKey ? '#' . $cPKey : ''));
686  $Pstart = $this->milliseconds();
687  $subinfo = ['key' => $cPKey];
688  // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
689  $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
690  $indexStatus = $this->getIndexStatus($fileInfo['mtime'], $phash_arr['phash']);
691  $reindexingRequired = $indexStatus->reindexRequired();
692  if ($reindexingRequired || $force) {
693  if ($reindexingRequired) {
694  $this->log_setTSlogMessage('Indexing needed, reason: ' . $indexStatus->reason(), LogLevel::NOTICE);
695  } else {
696  $this->log_setTSlogMessage('Indexing forced by flag', LogLevel::NOTICE);
697  }
698  // Check external file counter:
699  if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
700  // Divide into title,keywords,description and body:
701  $this->timeTracker->push('Split content');
702  $indexingDataDtoAsString = $this->readFileContent($ext, $absFile, $cPKey);
703  $this->timeTracker->pull();
704  if ($indexingDataDtoAsString !== null) {
705  // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
706  $content_md5h = md5(implode('', $indexingDataDtoAsString->toArray()));
707  if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
708  // Increment counter:
709  $this->externalFileCounter++;
710 
711  // Splitting words
712  $this->timeTracker->push('Extract words from content');
713  $splitInWords = $this->processWordsInArrays($indexingDataDtoAsString);
714  $this->timeTracker->pull();
715 
716  // Analyze the indexed words.
717  $this->timeTracker->push('Analyze the extracted words');
718  $indexArr = $this->indexAnalyze($splitInWords);
719  $this->timeTracker->pull();
720 
721  // Submitting page (phash) record
722  $this->timeTracker->push('Submitting page');
723 
724  // Unfortunately the original creation time cannot be determined, therefore we fall back to the modification date
725  $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $indexingDataDtoAsString);
726  $this->timeTracker->pull();
727 
728  // Check words and submit to word list if not there
729  $this->timeTracker->push('Check word list and submit words');
// NOTE(review): the statement opening the block closed by the brace after submitWords()
// is not visible in this view (the embedded line numbering skips a line here) - confirm against the original file.
731  $indexArr = $this->removePhashCollisions($indexArr);
732  $this->checkWordList($indexArr);
733  $this->submitWords($indexArr, $phash_arr['phash']);
734  }
735  $this->timeTracker->pull();
736 
737  // Set parsetime
738  $this->updateParsetime($phash_arr['phash'], $this->milliseconds() - $Pstart);
739  } else {
740  // Update the timestamp
741  $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
742  $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
743  }
744  } else {
// Reached when readFileContent() returned null for this part.
745  $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
746  }
747  } else {
748  $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
749  }
750  } else {
751  $this->log_setTSlogMessage('Indexing not needed, reason: ' . $indexStatus->reason());
752  }
753  // Checking and setting sections:
754  $this->submitFile_section($phash_arr['phash']);
755  // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
756  $this->timeTracker->pull();
757  }
758  } else {
759  $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
760  }
761  } else {
762  $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
763  }
764  }
765 
/**
 * Lets the external parser registered for the extension read one part of a file.
 *
 * @param string $fileExtension File extension selecting the parser
 * @param string $absoluteFileName Absolute path of the file to read
 * @param string|int $sectionPointer Part of the file to read, as returned by fileContentParts()
 * @return IndexingDataAsString|null Parsed content, or null when no parser is
 *                                   registered or the parser returned nothing usable
 */
public function readFileContent(string $fileExtension, string $absoluteFileName, string|int $sectionPointer): ?IndexingDataAsString
{
    // Consult relevant external document parser; a missing registration is not an error.
    $parser = $this->external_parsers[$fileExtension] ?? null;
    if (!is_object($parser)) {
        return null;
    }

    $indexingDataDto = $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);

    if (is_array($indexingDataDto)) {
        trigger_error(
            sprintf(
                'The method %s returns an array, which is deprecated and will stop working in TYPO3 v14.0. Return an instance of %s instead.',
                get_class($parser) . '::readFileContent()',
                IndexingDataAsString::class
            ),
            E_USER_DEPRECATED
        );
        $indexingDataDto = IndexingDataAsString::fromArray($indexingDataDto);
    }

    return $indexingDataDto instanceof IndexingDataAsString ? $indexingDataDto : null;
}
800 
/**
 * Asks the external parser registered for the extension into which parts a
 * file is to be split for indexing.
 *
 * @param string $ext File extension selecting the parser
 * @param string $absFile Absolute path of the file
 * @return array Part keys as returned by the parser; [0] when no parser is registered
 */
public function fileContentParts(string $ext, string $absFile): array
{
    // Consult relevant external document parser; a missing registration is not an error.
    $parser = $this->external_parsers[$ext] ?? null;
    if (!is_object($parser)) {
        return [0];
    }

    return $parser->fileContentParts($ext, $absFile);
}
817 
/**
 * Wraps plain content into an indexing DTO: the content becomes the body,
 * all other fields keep the values from $defaultIndexingDataPayload.
 *
 * @param string $content Text to index
 */
public function splitRegularContent(string $content): IndexingDataAsString
{
    $dto = IndexingDataAsString::fromArray($this->defaultIndexingDataPayload);
    $dto->body = $content;

    return $dto;
}
828 
829  /**********************************
830  *
831  * Analysing content, Extracting words
832  *
833  **********************************/
/**
 * Decodes the HTML entities of every non-empty field of the DTO in place.
 *
 * @param IndexingDataAsString $indexingDataDto DTO whose fields are rewritten
 */
public function charsetEntity2utf8(IndexingDataAsString $indexingDataDto): void
{
    foreach ($indexingDataDto->toArray() as $field => $text) {
        if ((string)$text === '') {
            continue;
        }
        // decode all numeric / html-entities in the string to real characters:
        $indexingDataDto->{$field} = html_entity_decode($text);
    }
}
847 
/**
 * Splits every field of the DTO into words using the lexer.
 *
 * @param IndexingDataAsString $input Field texts
 * @return IndexingDataAsArray Word lists per field; title, keywords and
 *                             description are de-duplicated, the body is not
 */
public function processWordsInArrays(IndexingDataAsString $input): IndexingDataAsArray
{
    // array_map() keeps the field names as keys.
    $wordsPerField = array_map(
        fn ($text) => $this->lexer->split2Words($text),
        $input->toArray()
    );

    $result = IndexingDataAsArray::fromArray($wordsPerField);

    // For title, keywords, and description we don't want duplicates
    foreach (['title', 'keywords', 'description'] as $field) {
        $result->{$field} = array_unique($result->{$field});
    }

    return $result;
}
869 
/**
 * Builds the description stored with an indexed page from its body text.
 *
 * Whitespace runs are collapsed to single spaces and the result is cut to the
 * number of bytes given by $this->conf['index_descrLgd'] (clamped to 0..255,
 * 200 when unset or empty) without splitting a multibyte character.
 *
 * @param IndexingDataAsString $indexingDataDto DTO providing the body text
 * @return string Shortened body text; '' when the resulting length is 0
 */
public function bodyDescription(IndexingDataAsString $indexingDataDto): string
{
    // Setting description
    $maxL = MathUtility::forceIntegerInRange($this->conf['index_descrLgd'] ?? 0, 0, 255, 200);
    if (!$maxL) {
        return '';
    }
    $body = $indexingDataDto->body;
    // With the /u modifier preg_replace() returns null for invalid UTF-8; retry
    // byte-wise and as a last resort keep the body unchanged.
    $bodyDescription = preg_replace('/\s+/u', ' ', $body)
        ?? preg_replace('/\s+/', ' ', $body)
        ?? $body;
    // Shorten the string. If the database has the wrong character set,
    // the string is probably truncated again.
    return \mb_strcut($bodyDescription, 0, $maxL, 'utf-8');
}
886 
/**
 * Builds the word index of a document from its word lists.
 *
 * Title, keywords and description words are registered with the bit offsets
 * 7, 6 and 5, followed by the body words.
 *
 * @param IndexingDataAsArray $indexingDataDto Word lists per field
 * @return array Word => word data as filled by analyzeHeaderinfo() and analyzeBody()
 */
public function indexAnalyze(IndexingDataAsArray $indexingDataDto): array
{
    $wordIndex = [];
    $headerFields = [
        7 => $indexingDataDto->title,
        6 => $indexingDataDto->keywords,
        5 => $indexingDataDto->description,
    ];
    foreach ($headerFields as $offset => $words) {
        $this->analyzeHeaderinfo($wordIndex, $words, $offset);
    }
    $this->analyzeBody($wordIndex, $indexingDataDto);

    return $wordIndex;
}
901 
/**
 * Registers the words of a header field (title, keywords, description) in
 * the word index.
 *
 * @param array $retArr Word index, modified in place: per word the keys "hash" (md5 of the word),
 *                      "cmp" (bit mask) and "count" (occurrences) are maintained
 * @param array $content Words of the field
 * @param int $offset Bit position set in "cmp" for every word of this field
 */
public function analyzeHeaderinfo(array &$retArr, array $content, int $offset): void
{
    foreach ($content as $word) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $word = mb_substr($word, 0, 60);
        if (!isset($retArr[$word])) {
            // Word ID (wid)
            $retArr[$word]['hash'] = md5($word);
        }
        // Priority used for flagBitMask feature (see extension configuration)
        $previousFlags = $retArr[$word]['cmp'] ?? 0;
        $retArr[$word]['cmp'] = $previousFlags | 2 ** $offset;

        // Increase number of occurrences, starting from zero for unseen words
        $occurrences = ($retArr[$word]['count'] ?? false) ?: 0;
        $retArr[$word]['count'] = $occurrences + 1;
        $this->wordcount++;
    }
}
929 
/**
 * Registers the body words in the word index.
 *
 * @param array $retArr Word index, modified in place: per word the keys "first" (position of the first
 *                      occurrence in the body), "hash" (md5 of the word) and "count" are maintained
 * @param IndexingDataAsArray $indexingDataDto DTO providing the body word list
 */
public function analyzeBody(array &$retArr, IndexingDataAsArray $indexingDataDto): void
{
    foreach ($indexingDataDto->body as $key => $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        // Character-wise like analyzeHeaderinfo(): a byte-wise cut could split a multibyte
        // character and would give the same word different hashes in header and body.
        $val = mb_substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // First occurrence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = md5($val);
        }
        if (!($retArr[$val]['count'] ?? false)) {
            $retArr[$val]['count'] = 0;
        }

        // Increase number of occurrences
        $retArr[$val]['count']++;
        $this->wordcount++;
    }
}
955 
956  /********************************
957  *
958  * SQL; TYPO3 Pages
959  *
960  *******************************/
/**
 * Writes the index records of the current TYPO3 page.
 *
 * Existing data for the page hash is removed first. Then index_phash, the
 * section and gr_list records, index_fulltext and - when the "debugMode"
 * extension setting is on - index_debug are written.
 */
public function submitPage(): void
{
    $phash = $this->hash['phash'];
    // Remove any current data for this phash:
    $this->removeOldIndexedPages($phash);
    // setting new phash_row
    $staticPageArguments = $this->conf['staticPageArguments'] ?? null;
    $fields = [
        'phash' => $phash,
        'phash_grouping' => $this->hash['phash_grouping'],
        'static_page_arguments' => is_array($staticPageArguments) ? json_encode($staticPageArguments) : null,
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // TYPO3 page
        'item_type' => 0,
        'item_title' => $this->indexingDataStringDto->title,
        'item_description' => $this->bodyDescription($this->indexingDataStringDto),
        'item_mtime' => (int)$this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        // Creation date of page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
    ];
    $this->connectionPool->getConnectionForTable('index_phash')->insert('index_phash', $fields);
    // PROCESSING index_section
    $this->submit_section($phash, $phash);
    // PROCESSING index_grlist
    $this->submit_grlist($phash, $phash);
    // PROCESSING index_fulltext
    $fulltext = implode(' ', $this->indexingDataStringDto->toArray());
    // The setting may be stored as a string or be absent; normalize it to an integer.
    $fullTextDataLength = (int)($this->indexerConfig['fullTextDataLength'] ?? 0);
    if ($fullTextDataLength > 0) {
        // mb_strcut() honours the byte limit but never splits a multibyte
        // character, so the stored text stays valid UTF-8.
        $fulltext = mb_strcut($fulltext, 0, $fullTextDataLength, 'utf-8');
    }
    $this->connectionPool->getConnectionForTable('index_fulltext')->insert('index_fulltext', [
        'phash' => $phash,
        'fulltextdata' => $fulltext,
    ]);
    // PROCESSING index_debug
    if ($this->indexerConfig['debugMode'] ?? false) {
        // The excerpts are cut on character boundaries: json_encode() with
        // JSON_THROW_ON_ERROR rejects strings that contain a split multibyte character.
        $contentExcerpt = mb_strcut($this->conf['content'], 0, 1000, 'utf-8');
        $bodyExcerpt = mb_strcut($this->indexingDataStringDto->body, 0, 1000, 'utf-8');
        $fields = [
            'phash' => $phash,
            'debuginfo' => json_encode([
                'external_parsers initialized' => array_keys($this->external_parsers),
                'conf' => array_merge($this->conf, ['content' => $contentExcerpt]),
                'contentParts' => array_merge($this->indexingDataStringDto->toArray(), ['body' => $bodyExcerpt]),
                'logs' => $this->internal_log,
            ], JSON_THROW_ON_ERROR),
        ];
        $this->connectionPool->getConnectionForTable('index_debug')->insert('index_debug', $fields);
    }
}
1029 
/**
 * Inserts one index_grlist row that links the given hash pair to the
 * gr_list of the current indexing configuration.
 *
 * @param string $hash Value written to the "phash" column
 * @param string $phash_x Value written to the "phash_x" column
 */
public function submit_grlist(string $hash, string $phash_x): void
{
    $groupList = $this->conf['gr_list'];
    $row = [
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => md5($groupList),
        'gr_list' => $groupList,
    ];
    $this->connectionPool
        ->getConnectionForTable('index_grlist')
        ->insert('index_grlist', $row);
}
1048 
/**
 * Inserts one index_section row for the page of the current configuration.
 *
 * The row carries the two given hashes, the page id and the rl0-rl2 values
 * supplied by getRootLineFields().
 *
 * @param string $hash Value written to the "phash" column
 * @param string $hash_t3 Value written to the "phash_t3" column
 */
public function submit_section(string $hash, string $hash_t3): void
{
    $sectionRow = [
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id'],
    ];
    // Appends rl0, rl1 and rl2 by reference.
    $this->getRootLineFields($sectionRow);

    $this->connectionPool
        ->getConnectionForTable('index_section')
        ->insert('index_section', $sectionRow);
}
1067 
/**
 * Deletes every index record registered for a TYPO3 page hash.
 *
 * @param string $phash Hash whose rows are removed
 */
public function removeOldIndexedPages(string $phash): void
{
    // Removing old registrations for all tables. Because the pages are TYPO3 pages
    // there can be nothing else than 1-1 relations here.
    foreach (['index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        $connection = $this->connectionPool->getConnectionForTable($tableName);
        $connection->delete($tableName, ['phash' => $phash]);
    }

    // Removing all index_section records with hash_t3 set to this hash (this includes such
    // records set for external media on the page as well!). The re-insert of these records
    // are done in indexRegularDocument($file).
    $sectionConnection = $this->connectionPool->getConnectionForTable('index_section');
    $sectionConnection->delete('index_section', ['phash_t3' => $phash]);
}
1087 
1088  /********************************
1089  *
1090  * SQL; External media
1091  *
1092  *******************************/
/**
 * Stores an indexed external file in the index tables.
 *
 * Deletes the rows already registered for the file's phash, then writes one
 * row to index_phash and one to index_fulltext. When the "debugMode" setting
 * is truthy, a row with JSON encoded diagnostics is written to index_debug.
 *
 * @param array $hash Hashes of the file; the keys "phash" and "phash_grouping" are read
 * @param string $file File reference, stored as "data_filename"; a URL scheme marks it as external
 * @param array $subinfo Stored JSON encoded as "static_page_arguments"
 * @param string $ext Key used to look up the parser in $this->external_parsers
 * @param int $mtime Stored as "item_mtime"
 * @param int $ctime Stored as "item_crdate"
 * @param int $size Stored as "item_size"
 * @param string $content_md5h Stored as "contentHash"
 * @param IndexingDataAsString $indexingDataDto Indexed content (title, body, ...)
 *
 * @throws \JsonException if the debug payload cannot be JSON encoded
 */
public function submitFilePage(array $hash, string $file, array $subinfo, string $ext, int $mtime, int $ctime, int $size, string $content_md5h, IndexingDataAsString $indexingDataDto): void
{
    // Find item Type: use the parser's mapping for this extension and fall back to the
    // extension itself when no parser or no (non-empty) mapping exists.
    $storeItemType = ($this->external_parsers[$ext]->ext2itemtype_map[$ext] ?? null) ?: $ext;

    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($hash['phash']);

    // Split filename:
    $fileParts = parse_url($file);

    // Setting new
    $fields = [
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'static_page_arguments' => json_encode($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $storeItemType,
        'item_title' => trim($indexingDataDto->title) ?: PathUtility::basename($file),
        'item_description' => $this->bodyDescription($indexingDataDto),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        // parse_url() may return false; "??" covers that case as well.
        'externalUrl' => ($fileParts['scheme'] ?? false) ? 1 : 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid'],
    ];
    $this->connectionPool->getConnectionForTable('index_phash')->insert('index_phash', $fields);

    // PROCESSING index_fulltext
    $fullText = implode(' ', $indexingDataDto->toArray());
    // Cast defensively: the setting is taken unvalidated from the extension configuration and
    // substr() throws a TypeError under strict_types when handed a non-integer length.
    $maxFullTextLength = (int)($this->indexerConfig['fullTextDataLength'] ?? 0);
    if ($maxFullTextLength > 0) {
        $fullText = substr($fullText, 0, $maxFullTextLength);
    }
    $this->connectionPool->getConnectionForTable('index_fulltext')->insert(
        'index_fulltext',
        [
            'phash' => $hash['phash'],
            'fulltextdata' => $fullText,
        ]
    );

    // PROCESSING index_debug
    if ($this->indexerConfig['debugMode'] ?? false) {
        $debugInfo = json_encode([
            'static_page_arguments' => $subinfo,
            // Body is cut to 1000 bytes to keep the debug record small.
            'contentParts' => array_merge($indexingDataDto->toArray(), ['body' => substr($indexingDataDto->body, 0, 1000)]),
            'logs' => $this->internal_log,
        ], JSON_THROW_ON_ERROR);
        $this->connectionPool->getConnectionForTable('index_debug')->insert(
            'index_debug',
            [
                'phash' => $hash['phash'],
                'debuginfo' => $debugInfo,
            ]
        );
    }
}
1165 
/**
 * Makes sure an index_section row exists for the given file hash on the
 * page of the current configuration; inserts one when none is found.
 *
 * @param string $hash phash of the file
 */
public function submitFile_section(string $hash): void
{
    // Testing if there is already a section
    $queryBuilder = $this->connectionPool->getQueryBuilderForTable('index_section');
    $samePhash = $queryBuilder->expr()->eq('phash', $queryBuilder->createNamedParameter($hash));
    $samePage = $queryBuilder->expr()->eq(
        'page_id',
        $queryBuilder->createNamedParameter($this->conf['id'], Connection::PARAM_INT)
    );
    $existingSections = (int)$queryBuilder
        ->count('phash')
        ->from('index_section')
        ->where($samePhash, $samePage)
        ->executeQuery()
        ->fetchOne();

    if ($existingSections !== 0) {
        return;
    }
    $this->submit_section($hash, $this->hash['phash']);
}
1194 
/**
 * Deletes the rows registered for a file hash from index_phash,
 * index_grlist, index_fulltext and index_debug.
 *
 * @param string $phash Hash whose rows are removed
 */
public function removeOldIndexedFiles(string $phash): void
{
    // Removing old registrations for tables.
    foreach (['index_phash', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        $connection = $this->connectionPool->getConnectionForTable($tableName);
        $connection->delete($tableName, ['phash' => $phash]);
    }
}
1208 
/**
 * Determines the indexing state of a document from its index_phash row.
 *
 * When the stored modification time equals $mtime, the row's tstamp is
 * refreshed as a side effect before IndexStatus::MTIME_MATCHED is returned.
 *
 * @param int $mtime Modification time of the document; 0 means "not set"
 * @param string $phash Hash identifying the document in index_phash
 */
public function getIndexStatus(int $mtime, string $phash): IndexStatus
{
    $indexedRow = $this->connectionPool
        ->getConnectionForTable('index_phash')
        ->select(['item_mtime', 'tstamp'], 'index_phash', ['phash' => $phash], [], [], 1)
        ->fetchAssociative();

    if (!$indexedRow) {
        // Page has never been indexed (is not represented in the index_phash table).
        return IndexStatus::NEW_DOCUMENT;
    }

    // mtime is only looked at if minAge is not set or if minAge is exceeded.
    $minimumAgeExceeded = !$this->tstamp_minAge
        || $GLOBALS['EXEC_TIME'] > $indexedRow['tstamp'] + $this->tstamp_minAge;
    if (!$minimumAgeExceeded) {
        return IndexStatus::MINIMUM_AGE_NOT_EXCEEDED;
    }

    if (!$mtime) {
        // The minimum age has exceeded, but mtime was not set, so the page was indexed.
        return IndexStatus::MODIFICATION_TIME_NOT_SET;
    }

    if ((int)$indexedRow['item_mtime'] !== $mtime) {
        // The minimum age has exceeded, mtime was set and differs from the
        // index_phash mtime, so it is about time to re-index.
        return IndexStatus::MODIFICATION_TIME_DIFFERS;
    }

    // mtime matched the document, so no changes detected and no content updated
    $this->updateTstamp($phash);
    $this->log_setTSlogMessage('mtime matched, timestamp updated.', LogLevel::NOTICE);
    return IndexStatus::MTIME_MATCHED;
}
1256 
/**
 * Looks for an index_phash row with the current phash_grouping and the
 * current content hash.
 *
 * With this query the page will only be indexed if its content is different
 * from the same "phash_grouping" page.
 *
 * @return array|true The matching row (only "phash" is selected), or true when no such row exists
 */
public function checkContentHash(): array|true
{
    $criteria = [
        'phash_grouping' => $this->hash['phash_grouping'],
        'contentHash' => $this->content_md5h,
    ];
    $matchingRow = $this->connectionPool
        ->getConnectionForTable('index_phash')
        ->select(['phash'], 'index_phash', $criteria, [], [], 1)
        ->fetchAssociative();

    if (!$matchingRow) {
        return true;
    }
    return $matchingRow;
}
1281 
/**
 * Tells whether index_phash holds no row with the given grouping hash and
 * content hash.
 *
 * @param string $hashGr Value matched against "phash_grouping"
 * @param string $content_md5h Value matched against "contentHash"
 * @return bool true when no matching row exists
 */
public function checkExternalDocContentHash(string $hashGr, string $content_md5h): bool
{
    $criteria = [
        'phash_grouping' => $hashGr,
        'contentHash' => $content_md5h,
    ];
    $matches = $this->connectionPool
        ->getConnectionForTable('index_phash')
        ->count('*', 'index_phash', $criteria);

    return $matches === 0;
}
1302 
/**
 * Tells whether index_grlist contains at least one row for the given phash_x.
 *
 * @param string $phash_x Value matched against the "phash_x" column
 */
public function is_grlist_set(string $phash_x): bool
{
    $connection = $this->connectionPool->getConnectionForTable('index_grlist');

    return $connection->count('phash_x', 'index_grlist', ['phash_x' => $phash_x]) > 0;
}
1316 
/**
 * Registers the gr_list of the current configuration for a phash unless
 * index_grlist already has a row for that phash and gr_list hash.
 *
 * @param string $phash Value matched against / written to the "phash" column
 * @param string $phash_x Value written to "phash_x" when a row is inserted
 */
public function update_grlist(string $phash, string $phash_x): void
{
    $alreadyRegistered = $this->connectionPool
        ->getConnectionForTable('index_grlist')
        ->count(
            'phash',
            'index_grlist',
            [
                'phash' => $phash,
                'hash_gr_list' => md5($this->conf['gr_list']),
            ]
        );

    if ($alreadyRegistered !== 0) {
        return;
    }

    $this->submit_grlist($phash, $phash_x);
    $this->log_setTSlogMessage(
        'Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'',
        LogLevel::NOTICE
    );
}
1340 
/**
 * Sets tstamp of the index_phash rows for $phash to the current execution
 * time and, when $mtime is non-zero, updates item_mtime as well.
 *
 * @param string $phash Hash of the rows to update
 * @param int $mtime New item_mtime; 0 leaves the column untouched
 */
public function updateTstamp(string $phash, int $mtime = 0): void
{
    $changes = ['tstamp' => $GLOBALS['EXEC_TIME']];
    if ($mtime !== 0) {
        $changes['item_mtime'] = $mtime;
    }

    $connection = $this->connectionPool->getConnectionForTable('index_phash');
    $connection->update('index_phash', $changes, ['phash' => $phash]);
}
1363 
/**
 * Writes the freeIndexSetId of the current configuration to the
 * index_phash rows of the given phash.
 *
 * @param string $phash Hash of the rows to update
 */
public function updateSetId(string $phash): void
{
    $connection = $this->connectionPool->getConnectionForTable('index_phash');
    $newValues = ['freeIndexSetId' => (int)$this->conf['freeIndexSetId']];

    $connection->update('index_phash', $newValues, ['phash' => $phash]);
}
1380 
/**
 * Stores the given parse time in the index_phash rows of the given phash.
 *
 * @param string $phash Hash of the rows to update
 * @param int $parsetime Value written to the "parsetime" column
 */
public function updateParsetime(string $phash, int $parsetime): void
{
    $connection = $this->connectionPool->getConnectionForTable('index_phash');

    $connection->update('index_phash', ['parsetime' => $parsetime], ['phash' => $phash]);
}
1397 
/**
 * Rewrites rl0-rl2 of all index_section rows belonging to the page of the
 * current configuration with the values from getRootLineFields().
 */
public function updateRootline(): void
{
    $rootLineValues = [];
    // Filled by reference with rl0, rl1 and rl2.
    $this->getRootLineFields($rootLineValues);

    $pageId = (int)$this->conf['id'];
    $connection = $this->connectionPool->getConnectionForTable('index_section');
    $connection->update('index_section', $rootLineValues, ['page_id' => $pageId]);
}
1415 
/**
 * Adds the keys rl0, rl1 and rl2 to the given array, taken from the first
 * three entries of $this->conf['rootline_uids']; missing entries become 0.
 *
 * @param array $fieldArray Field array, passed by reference
 */
public function getRootLineFields(array &$fieldArray): void
{
    foreach ([0, 1, 2] as $level) {
        $fieldArray['rl' . $level] = (int)($this->conf['rootline_uids'][$level] ?? 0);
    }
}
1428 
1429  /********************************
1430  *
1431  * SQL; Submitting words
1432  *
1433  *******************************/
/**
 * Makes sure every word of the list has a row in index_words, inserting the
 * ones whose word id ("hash") is not stored yet.
 *
 * Does nothing for an empty list or when MySQL fulltext is enabled.
 *
 * @param array $wordListArray Word data keyed by baseword; each entry's "hash" is used as wid
 */
public function checkWordList(array $wordListArray): void
{
    if ($wordListArray === [] || IndexedSearchUtility::isMysqlFullTextEnabled()) {
        return;
    }

    $expectedCount = count($wordListArray);
    $wordIds = array_column($wordListArray, 'hash');

    $countQuery = $this->connectionPool->getQueryBuilderForTable('index_words');
    $storedCount = (int)$countQuery
        ->count('baseword')
        ->from('index_words')
        ->where(
            $countQuery->expr()->in('wid', $countQuery->quoteArrayBasedValueListToStringList($wordIds))
        )
        ->executeQuery()
        ->fetchOne();

    if ($storedCount === $expectedCount) {
        return;
    }

    $connection = $this->connectionPool->getConnectionForTable('index_words');
    $selectQuery = $connection->createQueryBuilder();
    $storedWords = $selectQuery
        ->select('wid')
        ->from('index_words')
        ->where(
            $selectQuery->expr()->in('wid', $selectQuery->quoteArrayBasedValueListToStringList($wordIds))
        )
        ->executeQuery();

    $this->log_setTSlogMessage('Inserting words: ' . ($expectedCount - $storedCount), LogLevel::NOTICE);

    $storedWordIds = [];
    while ($row = $storedWords->fetchAssociative()) {
        $storedWordIds[] = $row['wid'];
    }

    // Keep only the words whose id is not stored yet.
    $missingWords = array_filter(
        $wordListArray,
        static fn ($wordData): bool => !in_array($wordData['hash'], $storedWordIds, true)
    );

    foreach ($missingWords as $baseword => $wordData) {
        // A duplicate-key error will occur here if a word that is already stored was NOT filtered
        // out above. However as long as the words are NO longer than 60 chars (the baseword varchar
        // is 60 characters...) this is not a problem.
        $connection->insert(
            'index_words',
            [
                'wid' => $wordData['hash'],
                'baseword' => $baseword,
            ]
        );
    }
}
1497 
/**
 * Replaces the index_rel rows of a phash with the relations for the given words.
 *
 * Words whose wid is flagged as stop word in index_words (is_stopword != 0)
 * are skipped. Nothing is done when MySQL fulltext is enabled.
 *
 * NOTE(review): the condition of the early-return guard was missing from this
 * listing, which left the method syntactically broken; it is restored here to
 * mirror the guard used by checkWordList() - confirm against upstream.
 *
 * @param array $wordList Word data; "hash", "count" and optionally "first" and "cmp" are read per entry
 * @param string $phash Hash the relations are stored for
 */
public function submitWords(array $wordList, string $phash): void
{
    if (IndexedSearchUtility::isMysqlFullTextEnabled()) {
        return;
    }

    $queryBuilder = $this->connectionPool->getQueryBuilderForTable('index_words');
    $result = $queryBuilder->select('wid')
        ->from('index_words')
        ->where(
            $queryBuilder->expr()->neq('is_stopword', $queryBuilder->createNamedParameter(0, Connection::PARAM_INT))
        )
        ->groupBy('wid')
        ->executeQuery();

    $stopWords = [];
    while ($row = $result->fetchAssociative()) {
        $stopWords[$row['wid']] = $row;
    }

    // Old relations are always dropped, even when no new ones get inserted below.
    $this->connectionPool->getConnectionForTable('index_rel')->delete('index_rel', ['phash' => $phash]);

    $fields = ['phash', 'wid', 'count', 'first', 'freq', 'flags'];
    $rows = [];
    foreach ($wordList as $val) {
        if (isset($stopWords[$val['hash']])) {
            continue;
        }
        // Guard against a zero word count, which would otherwise raise a DivisionByZeroError.
        $frequency = $this->wordcount > 0 ? $val['count'] / $this->wordcount : 0.0;
        $rows[] = [
            $phash,
            $val['hash'],
            (int)$val['count'],
            (int)($val['first'] ?? 0),
            $this->freqMap($frequency),
            ($val['cmp'] ?? 0) & $this->flagBitMask,
        ];
    }

    if ($rows !== []) {
        $this->connectionPool->getConnectionForTable('index_rel')->bulkInsert('index_rel', $rows, $fields);
    }
}
1542 
/**
 * Maps a word frequency to an integer using $this->freqMax and $this->freqRange.
 *
 * Frequencies up to 1 are multiplied by freqMax * 100 * freqRange and capped
 * at freqRange; larger values are divided by that factor instead. The result
 * is truncated to int.
 *
 * @param float $freq Frequency to map
 */
public function freqMap(float $freq): int
{
    $scale = $this->freqMax * 100 * $this->freqRange;

    if ($freq <= 1) {
        return (int)min($freq * $scale, $this->freqRange);
    }

    return (int)($freq / $scale);
}
1561 
1562  /********************************
1563  *
1564  * Hashing
1565  *
1566  *******************************/
/**
 * Computes the two hashes of the current TYPO3 page and stores them in
 * $this->hash as "phash_grouping" and "phash".
 */
public function setT3Hashes(): void
{
    $staticPageArguments = $this->conf['staticPageArguments'];

    // Grouping hash: identifies a "page" combined of id, type, language, mount point
    // and static page arguments.
    $pageIdentity = [
        'id' => (int)$this->conf['id'],
        'type' => (int)$this->conf['type'],
        'sys_lang' => (int)$this->conf['sys_language_uid'],
        'MP' => (string)$this->conf['MP'],
        'staticPageArguments' => is_array($staticPageArguments) ? json_encode($staticPageArguments) : null,
    ];
    $this->hash['phash_grouping'] = md5(serialize($pageIdentity));

    // Plain phash: additionally takes gr_list into account (subdivision where special page
    // composition based on login matters. It is expected that such pages are normally similar
    // regardless of the login.)
    $pageIdentity['gr_list'] = (string)$this->conf['gr_list'];
    $this->hash['phash'] = md5(serialize($pageIdentity));
}
1586 
/**
 * Computes the hashes of an external file.
 *
 * "phash_grouping" is derived from the file reference alone, "phash" from
 * the file reference together with $subinfo.
 *
 * @param string $file File reference
 * @param array $subinfo Additional information that distinguishes parts of the same file
 * @return array Array with the keys "phash_grouping" and "phash"
 */
public function setExtHashes(string $file, array $subinfo = []): array
{
    $groupingSource = ['file' => $file];
    $fullSource = $groupingSource;
    $fullSource['subinfo'] = $subinfo;

    return [
        'phash_grouping' => md5(serialize($groupingSource)),
        'phash' => md5(serialize($fullSource)),
    ];
}
1608 
/**
 * Passes a message to the time tracker and, when the "debugMode" setting is
 * truthy, also appends it to $this->internal_log.
 *
 * @param string $msg Message to log
 * @param string $logLevel Log level handed to the time tracker
 */
public function log_setTSlogMessage(string $msg, string $logLevel = LogLevel::INFO): void
{
    $this->timeTracker->setTSlogMessage($msg, $logLevel);

    $debugModeEnabled = !empty($this->indexerConfig['debugMode']);
    if ($debugModeEnabled) {
        $this->internal_log[] = $msg;
    }
}
1617 
/**
 * Re-joins a comma separated keyword list with ", " between the trimmed
 * keywords and wraps the result in one leading and one trailing space.
 *
 * @param string $keywordList Comma separated keywords
 */
protected function addSpacesToKeywordList(string $keywordList): string
{
    $spacedList = implode(', ', GeneralUtility::trimExplode(',', $keywordList));

    return ' ' . $spacedList . ' ';
}
1631 
/**
 * Drops every word whose "hash" was already seen on an earlier entry, so
 * that each hash occurs only once in the returned list.
 *
 * Seen hashes are tracked as array keys, which makes the check constant-time
 * per word instead of scanning a growing list.
 *
 * @param array $wordList Word data keyed by baseword; each entry's "hash" is read
 * @return array The word list without later entries that repeat a hash; keys and order are preserved
 */
private function removePhashCollisions(array $wordList): array
{
    $seenHashes = [];
    foreach ($wordList as $baseword => $wordData) {
        $hash = $wordData['hash'];
        if (isset($seenHashes[$hash])) {
            unset($wordList[$baseword]);
            continue;
        }
        $seenHashes[$hash] = true;
    }
    return $wordList;
}
1651 
/**
 * Returns the current Unix time in milliseconds, rounded to the nearest integer.
 */
protected function milliseconds(): int
{
    $secondsWithFraction = microtime(true);

    return (int)round($secondsWithFraction * 1000);
}
1659 }
‪TYPO3\CMS\IndexedSearch
‪TYPO3\CMS\Core\Utility\PathUtility
Definition: PathUtility.php:27
‪TYPO3\CMS\Core\Database\Connection\PARAM_INT
‪const PARAM_INT
Definition: Connection.php:52
‪TYPO3\CMS\IndexedSearch\Dto\IndexingDataAsString\fromArray
‪static fromArray(array $input)
Definition: IndexingDataAsString.php:35
‪TYPO3\CMS\Core\Utility\PathUtility\isAbsolutePath
‪static isAbsolutePath(string $path)
Definition: PathUtility.php:286
‪TYPO3\CMS\IndexedSearch\Dto\IndexingDataAsString
Definition: IndexingDataAsString.php:24
‪TYPO3\CMS\Core\Html\HtmlParser
Definition: HtmlParser.php:26
‪TYPO3\CMS\Core\Configuration\ExtensionConfiguration
Definition: ExtensionConfiguration.php:47
‪TYPO3\CMS\Core\Core\Environment\getPublicPath
‪static getPublicPath()
Definition: Environment.php:187
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility\isMysqlFullTextEnabled
‪static isMysqlFullTextEnabled()
Definition: IndexedSearchUtility.php:148
‪TYPO3\CMS\IndexedSearch\Type\IndexStatus
‪IndexStatus
Definition: IndexStatus.php:24
‪$fields
‪$fields
Definition: pages.php:5
‪TYPO3\CMS\Core\Utility\PathUtility\basename
‪static basename(string $path)
Definition: PathUtility.php:219
‪TYPO3\CMS\Core\Utility\GeneralUtility\writeFile
‪static bool writeFile(string $file, string $content, bool $changePermissions=false)
Definition: GeneralUtility.php:1469
‪TYPO3\CMS\Core\Utility\GeneralUtility\getUrl
‪static string false getUrl(string $url)
Definition: GeneralUtility.php:1444
‪TYPO3\CMS\IndexedSearch\Dto\IndexingDataAsArray\fromArray
‪static fromArray(array $input)
Definition: IndexingDataAsArray.php:37
‪TYPO3\CMS\Core\Http\RequestFactory
Definition: RequestFactory.php:30
‪TYPO3\CMS\Core\Database\Connection
Definition: Connection.php:41
‪TYPO3\CMS\Webhooks\Message\$url
‪identifier readonly UriInterface $url
Definition: LoginErrorOccurredMessage.php:36
‪$GLOBALS
‪$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['adminpanel']['modules']
Definition: ext_localconf.php:25
‪TYPO3\CMS\Core\Core\Environment
Definition: Environment.php:41
‪TYPO3\CMS\Core\Utility\MathUtility
Definition: MathUtility.php:24
‪TYPO3\CMS\Core\Database\ConnectionPool
Definition: ConnectionPool.php:46
‪TYPO3\CMS\Core\Utility\MathUtility\forceIntegerInRange
‪static int forceIntegerInRange(mixed $theInt, int $min, int $max=2000000000, int $defaultValue=0)
Definition: MathUtility.php:34
‪TYPO3\CMS\IndexedSearch\Dto\IndexingDataAsArray
Definition: IndexingDataAsArray.php:26
‪TYPO3\CMS\Core\Utility\GeneralUtility
Definition: GeneralUtility.php:52
‪TYPO3\CMS\Core\TimeTracker\TimeTracker
Definition: TimeTracker.php:34
‪TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility
Definition: IndexedSearchUtility.php:29
‪TYPO3\CMS\Core\Utility\GeneralUtility\trimExplode
‪static list< string > trimExplode(string $delim, string $string, bool $removeEmptyValues=false, int $limit=0)
Definition: GeneralUtility.php:822