TYPO3 CMS  TYPO3_8-7
Indexer.php
Go to the documentation of this file.
1 <?php
3 
4 /*
5  * This file is part of the TYPO3 CMS project.
6  *
7  * It is free software; you can redistribute it and/or modify it under
8  * the terms of the GNU General Public License, either version 2
9  * of the License, or any later version.
10  *
11  * For the full copyright and license information, please read the
12  * LICENSE.txt file that was distributed with this source code.
13  *
14  * The TYPO3 project - inspiring people to share!
15  */
16 
24 
28 class Indexer
29 {
/**
 * Log messages keyed by the result code of the mtime/tstamp check.
 * Codes greater than zero are logged as "Indexing needed", all other
 * codes as "Indexing not needed" (see indexTypo3PageContent()).
 *
 * @var array
 */
public $reasons = [
    -1 => 'mtime matched the document, so no changes detected and no content updated',
    -2 => 'The minimum age was not exceeded',
    1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
    2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
    3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
    4 => 'Page has never been indexed (is not represented in the index_phash table).'
];

/**
 * Comma list of HTML tag names whose whole sections are removed from the
 * body before indexing (see splitHTMLContent()).
 *
 * @var string
 */
public $excludeSections = 'script,style';

/**
 * External document parser objects, keyed by file extension
 * (filled by initializeExternalParsers()).
 *
 * @var array
 */
public $external_parsers = [];

/**
 * Default group list.
 * NOTE(review): not read in the visible part of the class - confirm usage.
 *
 * @var string
 */
public $defaultGrList = '0,-1';

/**
 * Max age derived in init() from the "maxAge" extension setting
 * multiplied by 3600.
 *
 * @var int
 */
public $tstamp_maxAge = 0;

/**
 * Min age derived in init() from the "minAge" extension setting
 * multiplied by 3600.
 *
 * @var int
 */
public $tstamp_minAge = 0;

/**
 * Upper limit for $externalFileCounter; set in init() from the
 * "maxExternalFiles" extension setting (range 0-1000, default 5).
 *
 * @var int
 */
public $maxExternalFiles = 0;

/**
 * If TRUE, indexing is done regardless of the mtime/gr_list checks.
 *
 * @var bool
 */
public $forceIndexing = false;

/**
 * Set by hook_indexContent() when a running crawler requested re-indexing.
 *
 * @var bool
 */
public $crawlerActive = false;

/**
 * Template of the array returned by splitHTMLContent().
 *
 * @var array
 */
public $defaultContentArray = [
    'title' => '',
    'description' => '',
    'keywords' => '',
    'body' => ''
];

/**
 * Word counter.
 * NOTE(review): not read in the visible part of the class - confirm usage.
 *
 * @var int
 */
public $wordcount = 0;

/**
 * Number of external documents indexed so far; compared against
 * $maxExternalFiles in indexRegularDocument().
 *
 * @var int
 */
public $externalFileCounter = 0;

/**
 * Configuration of the current indexing run (page id, type, language,
 * content, ...); built by hook_indexContent() or backend_initIndexer().
 *
 * @var array
 */
public $conf = [];

/**
 * Extension configuration of indexed_search (unserialized in init()).
 *
 * @var array
 */
public $indexerConfig = [];

/**
 * Hashes identifying the current document; the key "phash" is read
 * throughout the class.
 *
 * @var array
 */
public $hash = [];

/**
 * Hashes of the external file currently being indexed
 * (result of setExtHashes()).
 *
 * @var array
 */
public $file_phash_arr = [];

/**
 * Content of the current page split into title, description,
 * keywords and body.
 *
 * @var array
 */
public $contentParts = [];

/**
 * Hash over the imploded $contentParts.
 *
 * @var string
 */
public $content_md5h = '';

/**
 * Internal log; reset for every content part in indexRegularDocument().
 *
 * @var array
 */
public $internal_log = [];

/**
 * Content last fetched by indexExternalUrl().
 *
 * @var string
 */
public $indexExternalUrl_content = '';

/**
 * cHash parameters of the current page (see init()).
 *
 * @var array
 */
public $cHashParams = [];

/**
 * NOTE(review): presumably the integer range word frequencies are scaled
 * to - not read in the visible part of the class, confirm.
 *
 * @var int
 */
public $freqRange = 32000;

/**
 * NOTE(review): presumably the maximum relative word frequency - not read
 * in the visible part of the class, confirm.
 *
 * @var float
 */
public $freqMax = 0.1;

/**
 * Whether metaphone search is enabled (set in init()).
 *
 * @var bool
 */
public $enableMetaphoneSearch = false;

/**
 * Set in init(): TRUE when the index_words table is not used but
 * metaphone search is enabled.
 *
 * @var bool
 */
public $storeMetaphoneInfoAsWords;

/**
 * NOTE(review): not written in the visible part of the class - confirm usage.
 *
 * @var string
 */
public $metaphoneContent = '';

/**
 * Charset converter instance (created in init()).
 *
 * @var \TYPO3\CMS\Core\Charset\CharsetConverter
 */
public $csObj;

/**
 * Metaphone hook object (created in init() when configured).
 *
 * @var object
 */
public $metaphoneObj;

/**
 * Lexer object that deconstructs text into words (created in init()).
 *
 * @var object
 */
public $lexerObj;

/**
 * Bit mask set in init() from the "flagBitMask" extension setting (0-255).
 *
 * @var int
 */
public $flagBitMask;

/**
 * @var TimeTracker
 */
protected $timeTracker;
237 
/**
 * Creates the indexer and obtains the shared TimeTracker instance.
 */
public function __construct()
{
    $tracker = GeneralUtility::makeInstance(TimeTracker::class);
    $this->timeTracker = $tracker;
}
245 
/**
 * Frontend hook: decides whether the rendered page should be indexed and,
 * if so, collects the indexing configuration from the page object and
 * starts indexing.
 *
 * @param object $pObj Page object being rendered (presumably the
 *                     TypoScriptFrontendController - verify against caller),
 *                     passed by reference
 */
public function hook_indexContent(&$pObj)
{
    // Indexer configuration from Extension Manager interface:
    $extensionSettings = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'], ['allowed_classes' => false]);

    // Crawler activation: the crawler must be loaded, a crawler session must
    // be running and re-indexing must be requested as processing instruction.
    $crawlerRequestsReindex = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')
        && $pObj->applicationData['tx_crawler']['running']
        && in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions']);
    if ($crawlerRequestsReindex) {
        // Simple log message for the crawler:
        $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
        // Crawler active flag
        $this->crawlerActive = true;
        $this->forceIndexing = true;
    }

    // Nothing to do unless indexing is enabled for this page.
    if (!$pObj->config['config']['index_enable']) {
        return;
    }

    $this->log_push('Index page', '');
    if ($extensionSettings['disableFrontendIndexing'] && !$this->crawlerActive) {
        $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
    } elseif ($pObj->page['no_search']) {
        $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
    } elseif ($pObj->no_cache) {
        $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
    } elseif ((int)$pObj->sys_language_uid !== (int)$pObj->sys_language_content) {
        $this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
    } else {
        // Root line uids
        $rootlineUids = [];
        foreach ($pObj->config['rootLine'] as $level => $rootlinePage) {
            $rootlineUids[$level] = $rootlinePage['uid'];
        }
        // Setting up internal configuration from the page object:
        $this->conf = [
            // Page id
            'id' => $pObj->id,
            // Page type
            'type' => $pObj->type,
            // sys_language UID of the language of the indexing.
            'sys_language_uid' => $pObj->sys_language_uid,
            // MP variable, if any (Mount Points)
            'MP' => $pObj->MP,
            // Group list
            'gr_list' => $pObj->gr_list,
            // cHash string for additional parameters
            'cHash' => $pObj->cHash,
            // Array of the additional parameters
            'cHash_array' => $pObj->cHash_array,
            // The creation date of the TYPO3 page
            'crdate' => $pObj->page['crdate'],
            // reg1 of the caching table. Not known what practical use this has.
            'page_cache_reg1' => $pObj->page_cache_reg1,
            'rootline_uids' => $rootlineUids,
            // Content string (HTML of TYPO3 page)
            'content' => $pObj->content,
            // Alternative title for indexing
            'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),
            // Character set of content (will be converted to utf-8 during indexing)
            'metaCharset' => $pObj->metaCharset,
            // Most recent modification time (seconds) of the content on the page.
            // Used to evaluate whether it should be re-indexed.
            'mtime' => $pObj->register['SYS_LASTCHANGED'] ?? $pObj->page['SYS_LASTCHANGED'],
            // Whether to index external documents like PDF, DOC etc. (if possible)
            'index_externals' => $pObj->config['config']['index_externals'],
            // Length of description text (max 250, default 200)
            'index_descrLgd' => $pObj->config['config']['index_descrLgd'],
            // Whether to index meta keywords/description; defaults to TRUE
            'index_metatags' => $pObj->config['config']['index_metatags'] ?? true,
            // Set to zero:
            'recordUid' => 0,
            'freeIndexUid' => 0,
            'freeIndexSetId' => 0,
        ];
        // Init and start indexing:
        $this->init();
        $this->indexTypo3PageContent();
    }
    $this->log_pull();
}
335 
336  /****************************
337  *
338  * Backend API
339  *
340  ****************************/
/**
 * Initializes the indexer for use outside of frontend rendering: builds
 * $this->conf from the given values and calls init().
 *
 * @param int $id Page id
 * @param int $type Page type
 * @param int $sys_language_uid sys_language uid of the language of the indexing
 * @param string $MP MP variable, if any (Mount Points)
 * @param array $uidRL Root line uids
 * @param array $cHash_array Array of additional parameters
 * @param bool $createCHash If TRUE, a cHash is generated from $cHash_array
 */
public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array = [], $createCHash = false)
{
    // cHash string for additional parameters
    $cHash = '';
    if ($createCHash) {
        /* @var $cacheHashCalculator \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
        $cacheHashCalculator = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\CacheHashCalculator::class);
        $cHash = $cacheHashCalculator->generateForParameters(GeneralUtility::implodeArrayForUrl('', $cHash_array));
    }

    // Setting up internal configuration:
    $this->conf = [
        // Information about page for which the indexing takes place
        'id' => $id,
        'type' => $type,
        'sys_language_uid' => $sys_language_uid,
        'MP' => $MP,
        // Group list (hardcoded for now...)
        'gr_list' => '0,-1',
        'cHash' => $cHash,
        'cHash_array' => $cHash_array,
        // Set to defaults
        'freeIndexUid' => 0,
        'freeIndexSetId' => 0,
        'page_cache_reg1' => 0,
        // Root line uids
        'rootline_uids' => $uidRL,
        // Whether to index external documents like PDF, DOC etc. (if possible)
        'index_externals' => 1,
        // Length of description text (max 250, default 200)
        'index_descrLgd' => 200,
        // Whether to index document keywords and description (if present)
        'index_metatags' => true,
    ];

    // Init and start indexing:
    $this->init();
}
394 
/**
 * Stores the "free index" uid and set id in the indexing configuration.
 *
 * @param int $freeIndexUid Free index UID
 * @param int $freeIndexSetId Set id, defaults to 0
 */
public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0)
{
    $freeIndexSettings = [
        'freeIndexUid' => $freeIndexUid,
        'freeIndexSetId' => $freeIndexSetId,
    ];
    foreach ($freeIndexSettings as $confKey => $confValue) {
        $this->conf[$confKey] = $confValue;
    }
}
406 
/**
 * Indexes arbitrary content as if it was a TYPO3 page: wraps the given
 * values in a minimal HTML document and passes it to
 * indexTypo3PageContent(). Requires the configuration to be set up
 * beforehand (see backend_initIndexer()).
 *
 * @param string $title Title, used for the <title> tag
 * @param string $keywords Keywords, used for the keywords meta tag
 * @param string $description Description, used for the description meta tag
 * @param string $content Body content
 * @param string $charset Character set of the content
 * @param int $mtime Most recent modification time (seconds) of the content
 * @param int $crdate Creation date of the content
 * @param int $recordUid UID of the record, if applicable
 */
public function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0)
{
    $escapedTitle = htmlspecialchars($title);
    $escapedKeywords = htmlspecialchars($keywords);
    $escapedDescription = htmlspecialchars($description);
    $escapedContent = htmlspecialchars($content);

    // Most recent modification time (seconds) of the content
    $this->conf['mtime'] = $mtime;
    // The creation date of the TYPO3 content
    $this->conf['crdate'] = $crdate;
    // UID of the record, if applicable
    $this->conf['recordUid'] = $recordUid;
    // Construct fake HTML for parsing (content string, HTML of TYPO3 page):
    $this->conf['content'] = '
 <html>
 <head>
 <title>' . $escapedTitle . '</title>
 <meta name="keywords" content="' . $escapedKeywords . '" />
 <meta name="description" content="' . $escapedDescription . '" />
 </head>
 <body>
 ' . $escapedContent . '
 </body>
 </html>';
    // Character set of content (will be converted to utf-8 during indexing)
    $this->conf['metaCharset'] = $charset;
    // Alternative title for indexing
    $this->conf['indexedDocTitle'] = '';

    // Index content as if it was a TYPO3 page:
    $this->indexTypo3PageContent();
}
449 
450  /********************************
451  *
452  * Initialization
453  *
454  *******************************/
/**
 * Initializes the indexer from $this->conf and the extension
 * configuration: cHash parameters, hashes, age limits, external parsers,
 * lexer, optional metaphone hook and the charset converter.
 *
 * Missing or broken extension configuration is tolerated: every setting
 * is read with a fallback so no undefined-index notices are raised.
 */
public function init()
{
    // Initializing:
    $this->cHashParams = $this->conf['cHash_array'];
    if (is_array($this->cHashParams) && !empty($this->cHashParams)) {
        if (!empty($this->conf['cHash'])) {
            // Add this so that URL's come out right...
            $this->cHashParams['cHash'] = $this->conf['cHash'];
        }
        unset($this->cHashParams['encryptionKey']);
    }
    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();

    // Indexer configuration from Extension Manager interface.
    // unserialize() yields FALSE for missing/corrupt data; normalize to an array.
    $serializedConfig = $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'] ?? '';
    $extensionConfig = is_string($serializedConfig) && $serializedConfig !== ''
        ? unserialize($serializedConfig, ['allowed_classes' => false])
        : false;
    $this->indexerConfig = is_array($extensionConfig) ? $extensionConfig : [];

    $this->tstamp_minAge = MathUtility::forceIntegerInRange((int)($this->indexerConfig['minAge'] ?? 0) * 3600, 0);
    $this->tstamp_maxAge = MathUtility::forceIntegerInRange((int)($this->indexerConfig['maxAge'] ?? 0) * 3600, 0);
    $this->maxExternalFiles = MathUtility::forceIntegerInRange($this->indexerConfig['maxExternalFiles'] ?? 0, 0, 1000, 5);
    $this->flagBitMask = MathUtility::forceIntegerInRange($this->indexerConfig['flagBitMask'] ?? 0, 0, 255);
    // Workaround: If the extension configuration was not updated yet, the value is not existing
    $this->enableMetaphoneSearch = !isset($this->indexerConfig['enableMetaphoneSearch']) || $this->indexerConfig['enableMetaphoneSearch'];
    $this->storeMetaphoneInfoAsWords = !IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;

    // Initialize external document parsers:
    // Example configuration, see ext_localconf.php of this file!
    if (!empty($this->conf['index_externals'])) {
        $this->initializeExternalParsers();
    }

    // Initialize lexer (class that deconstructs the text into words):
    $extensionHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search'] ?? [];
    $lexerObjRef = !empty($extensionHooks['lexer']) ? $extensionHooks['lexer'] : 'TYPO3\\CMS\\IndexedSearch\\Lexer';
    $this->lexerObj = GeneralUtility::getUserObj($lexerObjRef);
    // NULL (as before) when the setting is absent
    $this->lexerObj->debug = $this->indexerConfig['debugMode'] ?? null;

    // Initialize metaphone hook:
    // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
    if ($this->enableMetaphoneSearch && !empty($extensionHooks['metaphone'])) {
        $this->metaphoneObj = GeneralUtility::getUserObj($extensionHooks['metaphone']);
        $this->metaphoneObj->pObj = $this;
    }

    // Init charset class:
    $this->csObj = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Charset\CharsetConverter::class);
}
498 
/**
 * Instantiates the external document parsers registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers']
 * and keeps those whose initParser() call succeeds, keyed by file extension.
 */
public function initializeExternalParsers()
{
    if (!is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'])) {
        return;
    }
    $registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'];
    foreach ($registeredParsers as $extension => $parserReference) {
        // Register first so the parser is visible through $this while it initializes.
        $this->external_parsers[$extension] = GeneralUtility::getUserObj($parserReference);
        $this->external_parsers[$extension]->pObj = $this;
        $initialized = $this->external_parsers[$extension]->initParser($extension);
        // Init parser and if it returns FALSE, unset its entry again:
        if (!$initialized) {
            unset($this->external_parsers[$extension]);
        }
    }
}
518 
519  /********************************
520  *
521  * Indexing; TYPO3 pages (HTML content)
522  *
523  *******************************/
/**
 * Indexes the HTML content in $this->conf['content'] as a TYPO3 page.
 *
 * Indexing happens when the mtime/tstamp check returns a positive code,
 * when the current gr_list is not yet registered for the page hash, or
 * when indexing is forced. If a record with the same content hash already
 * exists (and max-age was not the trigger), only timestamp, set id,
 * gr_list and rootline are updated.
 */
public function indexTypo3PageContent()
{
    $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
    $is_grlist = $this->is_grlist_set($this->hash['phash']);

    $indexingNeeded = $check > 0 || !$is_grlist || $this->forceIndexing;
    if (!$indexingNeeded) {
        $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        return;
    }

    // Setting message:
    if ($this->forceIndexing) {
        $reason = 'Forced';
    } elseif ($check > 0) {
        $reason = $this->reasons[$check];
    } else {
        $reason = 'Updates gr_list!';
    }
    $this->log_setTSlogMessage('Indexing needed, reason: ' . $reason, 1);

    // Divide into title,keywords,description and body:
    $this->log_push('Split content', '');
    $this->contentParts = $this->splitHTMLContent($this->conf['content']);
    if ($this->conf['indexedDocTitle']) {
        $this->contentParts['title'] = $this->conf['indexedDocTitle'];
    }
    $this->log_pull();

    // Hash over the actual page content. Title, description and keywords
    // are deliberately included so that e.g. a changed page title triggers re-indexing.
    $this->content_md5h = IndexedSearchUtility::md5inthash(implode('', $this->contentParts));

    // Checks whether a page with the very same content hash is already
    // indexed. If so, nothing but the bookkeeping needs to be updated,
    // regardless of mtime. This also avoids re-indexing for logged-in
    // frontend users when the page content is unchanged.
    $checkCHash = $this->checkContentHash();
    if (is_array($checkCHash) && $check !== 1) {
        // Update the timestamp
        $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
        $this->updateSetId($this->hash['phash']);
        // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
        $this->update_grlist($checkCHash['phash'], $this->hash['phash']);
        $this->updateRootline();
        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
        return;
    }

    $Pstart = GeneralUtility::milliseconds();

    $this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
    $this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
    $this->log_pull();

    // Splitting words
    $this->log_push('Extract words from content', '');
    $splitInWords = $this->processWordsInArrays($this->contentParts);
    $this->log_pull();

    // Analyse the indexed words.
    $this->log_push('Analyse the extracted words', '');
    $indexArr = $this->indexAnalyze($splitInWords);
    $this->log_pull();

    // Submitting page (phash) record
    $this->log_push('Submitting page', '');
    $this->submitPage();
    $this->log_pull();

    // Check words and submit to word list if not there
    $this->log_push('Check word list and submit words', '');
    if (IndexedSearchUtility::isTableUsed('index_words')) {
        $this->checkWordList($indexArr);
        $this->submitWords($indexArr, $this->hash['phash']);
    }
    $this->log_pull();

    // Set parsetime
    $this->updateParsetime($this->hash['phash'], GeneralUtility::milliseconds() - $Pstart);

    // Checking external files if configured for.
    $this->log_push('Checking external files', '');
    if ($this->conf['index_externals']) {
        $this->extractLinks($this->conf['content']);
    }
    $this->log_pull();
}
598 
/**
 * Splits HTML content into title, keywords, description and body.
 *
 * If the document has no <body tag, the whole content is treated as head
 * (so title and meta tags are still extracted) and the body stays empty.
 * Previously the head part became empty in that case as well, because
 * substr($content, 0, -0) returns an empty string.
 *
 * @param string $content HTML content to index
 * @return array Array with the keys of $this->defaultContentArray
 *               (title, description, keywords, body)
 */
public function splitHTMLContent($content)
{
    $content = (string)$content;
    $contentArr = $this->defaultContentArray;

    // Divide head from body at the first (case-insensitive) "<body".
    $bodyStart = stripos($content, '<body');
    if ($bodyStart === false) {
        $headPart = $content;
        $contentArr['body'] = '';
    } else {
        $headPart = (string)substr($content, 0, $bodyStart);
        $contentArr['body'] = (string)substr($content, $bodyStart);
    }

    // Get title. If it contains a colon, only the part after the first colon is used.
    $this->embracingTags($headPart, 'TITLE', $contentArr['title'], $dummy2, $dummy);
    $titleParts = explode(':', (string)$contentArr['title'], 2);
    $contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);

    // Get keywords and description metatags
    if ($this->conf['index_metatags']) {
        // Collect the attribute string of every <meta ...> tag; each call
        // continues with the remainder of the head after the tag found.
        $metaTags = [];
        $attributeString = '';
        while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $attributeString)) {
            $metaTags[] = $attributeString;
        }
        foreach ($metaTags as $attributeString) {
            $attributes = GeneralUtility::get_tag_attributes($attributeString);
            // Meta tags without name/content (e.g. charset or http-equiv) must not raise notices.
            $name = isset($attributes['name']) ? (string)$attributes['name'] : '';
            $value = isset($attributes['content']) ? $attributes['content'] : '';
            if (stripos($name, 'keywords') !== false) {
                $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($value);
            }
            if (stripos($name, 'description') !== false) {
                $contentArr['description'] .= ',' . $value;
            }
        }
    }

    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($contentArr['body']);

    // Get rid of unwanted sections (ie. scripting and style stuff) in body
    foreach (explode(',', $this->excludeSections) as $tag) {
        while ($this->embracingTags($contentArr['body'], $tag, $dummy, $contentArr['body'], $dummy2)) {
        }
    }

    // Remove tags, but first make sure we don't concatenate words by doing it
    $contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
    $contentArr['body'] = trim(strip_tags($contentArr['body']));
    $contentArr['keywords'] = trim($contentArr['keywords']);
    $contentArr['description'] = trim($contentArr['description']);

    return $contentArr;
}
650 
/**
 * Extracts the charset from the CONTENT-TYPE http-equiv meta tag of an
 * HTML document.
 *
 * @param string $content HTML content
 * @return string|null The charset value as written in the document, or
 *                     NULL if no such meta tag / charset was found
 */
public function getHTMLcharset($content)
{
    $metaTagPattern = '/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i';
    $charsetPattern = '/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i';

    if (!preg_match($metaTagPattern, $content, $metaTagMatch)) {
        return null;
    }
    if (!preg_match($charsetPattern, $metaTagMatch[0], $charsetMatch)) {
        return null;
    }
    return $charsetMatch[1];
}
665 
/**
 * Converts an HTML document to UTF-8 and decodes its HTML entities.
 *
 * @param string $content HTML content
 * @param string $charset Charset of the content; if empty it is detected
 *                        from the document via getHTMLcharset()
 * @return string Converted content with entities decoded
 */
public function convertHTMLToUtf8($content, $charset = '')
{
    // Find charset:
    if (!$charset) {
        $charset = $this->getHTMLcharset($content);
    }
    $normalizedCharset = trim(strtolower($charset));

    // Convert charset, unless it is unknown or already UTF-8:
    $needsConversion = $normalizedCharset && $normalizedCharset !== 'utf-8';
    if ($needsConversion) {
        $content = mb_convert_encoding($content, 'utf-8', $normalizedCharset);
    }

    // Convert entities, assuming document is now UTF-8:
    $decoded = html_entity_decode($content);
    return $decoded;
}
685 
/**
 * Finds the first occurrence of a tag (case-insensitive) in a string and
 * returns its content, its attribute string and the string with the tag
 * section removed.
 *
 * If the closing tag is missing, $tagContent is empty and $stringAfter is
 * only the text following the opening tag. An opening tag that is never
 * terminated by ">" no longer raises an undefined-offset notice; in that
 * case $stringAfter is an empty string.
 *
 * @param string $string String to search in
 * @param string $tagName Tag name, e.g. "script"
 * @param string $tagContent Passed by reference: content between opening and closing tag
 * @param string $stringAfter Passed by reference: $string with the tag section removed
 * @param string $paramList Passed by reference: raw attribute string of the opening tag
 * @return bool TRUE if the opening tag was found, otherwise FALSE
 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
{
    $string = (string)$string;
    $endTag = '</' . $tagName . '>';
    $startTag = '<' . $tagName;

    // Case-insensitive search for the opening tag.
    $startPosition = stripos($string, $startTag);
    if ($startPosition === false) {
        return false;
    }

    // Split "attributes>remainder"; pad so a missing ">" yields an empty remainder.
    $afterStartTag = (string)substr($string, $startPosition + strlen($startTag));
    list($paramList, $remainder) = array_pad(explode('>', $afterStartTag, 2), 2, '');

    $fromEndTag = stristr($remainder, $endTag);
    if ($fromEndTag !== false) {
        $stringBefore = (string)substr($string, 0, $startPosition);
        $tagContent = (string)substr($remainder, 0, strlen($remainder) - strlen($fromEndTag));
        $stringAfter = $stringBefore . substr($fromEndTag, strlen($endTag));
    } else {
        $tagContent = '';
        $stringAfter = $remainder;
    }
    return true;
}
720 
/**
 * Reduces $body to the parts selected by <!--TYPO3SEARCH_begin--> and
 * <!--TYPO3SEARCH_end--> markers.
 *
 * Text following a "begin" marker is kept. Text preceding an "end" marker
 * is kept when it was not introduced by a "begin" marker (i.e. the
 * document starts in "included" state). That pending text is appended only
 * once: repeated "end" markers no longer duplicate it. A marker without a
 * terminating "-->" contributes no text instead of raising a notice.
 *
 * @param string $body HTML body, passed by reference and modified when markers are found
 * @return bool TRUE if markers were found (and $body was rewritten), otherwise FALSE
 */
public function typoSearchTags(&$body)
{
    $chunks = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body);
    if (!is_array($chunks) || count($chunks) <= 1) {
        return false;
    }

    $body = '';
    // Text seen since the last marker that is not (yet) known to be included.
    $pending = '';
    foreach ($chunks as $chunk) {
        $pieces = explode('-->', $chunk, 2);
        $marker = trim($pieces[0]);
        if ($marker === 'begin') {
            $body .= isset($pieces[1]) ? $pieces[1] : '';
            $pending = '';
        } elseif ($marker === 'end') {
            $body .= $pending;
            // Consume the pending text so a further "end" does not append it again.
            $pending = '';
        } else {
            $pending = $chunk;
        }
    }
    return true;
}
747 
/**
 * Extracts the links of the given HTML content and indexes (or queues for
 * the crawler) the documents they point to.
 *
 * - Links with a URL scheme and no local path are passed to
 *   indexExternalUrl() if "indexExternalURLs" is enabled.
 * - Links without query string that resolve to an existing local file are
 *   indexed via indexRegularDocument(), or added to the crawler queue when
 *   "useCrawlerForExternalFiles" is enabled and the crawler is loaded.
 *
 * @param string $content HTML content to search for links
 */
public function extractLinks($content)
{
    // Get links:
    $list = $this->extractHyperLinks($content);

    // Crawler instance, or NULL when files are indexed directly.
    $crawler = null;
    if (!empty($this->indexerConfig['useCrawlerForExternalFiles']) && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
        $crawler = GeneralUtility::makeInstance(\tx_crawler_lib::class);
    }

    // Traverse links:
    foreach ($list as $linkInfo) {
        // localPath means: This file is sent by a download script. While the
        // indexed URL has to point to $linkInfo['href'], the absolute path to
        // the file is specified there.
        $hasLocalPath = !empty($linkInfo['localPath']);
        // Decode entities:
        $linkSource = htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);

        // Parse URL; parse_url() returns FALSE for seriously malformed URLs.
        $qParts = parse_url($linkSource) ?: [];

        // Check for jumpurl (TYPO3 specific thing...)
        if (!empty($qParts['query']) && strstr($qParts['query'], 'jumpurl=')) {
            parse_str($qParts['query'], $getP);
            if (!isset($getP['jumpurl']) || !is_string($getP['jumpurl'])) {
                // "jumpurl=" only matched as part of another parameter name: nothing to index.
                continue;
            }
            $linkSource = $getP['jumpurl'];
            $qParts = parse_url($linkSource) ?: [];
        }

        if (!$hasLocalPath && !empty($qParts['scheme'])) {
            if (!empty($this->indexerConfig['indexExternalURLs'])) {
                // Index external URL (http or otherwise)
                $this->indexExternalUrl($linkSource);
            }
            continue;
        }
        if (!empty($qParts['query'])) {
            // Local links with query string are not treated as files.
            continue;
        }

        $linkSource = urldecode($linkSource);
        $localFile = GeneralUtility::isAllowedAbsPath($linkSource)
            ? $linkSource
            : GeneralUtility::getFileAbsFileName(PATH_site . $linkSource);
        if (!$localFile || !@is_file($localFile)) {
            continue;
        }

        // Index local file:
        if (is_object($crawler)) {
            $params = ['document' => $linkSource];
            if ($hasLocalPath) {
                $params['alturl'] = $linkInfo['href'];
            }
            $params['conf'] = $this->conf;
            // The page content is not needed in the queue entry.
            unset($params['conf']['content']);
            $crawler->addQueueEntry_callBack(0, $params, Hook\CrawlerFilesHook::class, $this->conf['id']);
            $this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
        } elseif ($hasLocalPath) {
            // Empty string when the file has no extension.
            $ext = strtolower(pathinfo($linkSource, PATHINFO_EXTENSION));
            $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
        } else {
            $this->indexRegularDocument($linkSource);
        }
    }
}
823 
/**
 * Extracts all <a> tags having a non-empty href that does not start
 * with "#".
 *
 * @param string $html HTML content
 * @return array List of arrays with the keys "tag" (the tag as found),
 *               "href" and "localPath" (result of createLocalPath())
 */
public function extractHyperLinks($html)
{
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $hyperLinksData = [];
    foreach ($htmlParser->splitTags('a', $html) as $index => $tagData) {
        // Only the odd entries of the split result are inspected as tags.
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'a') {
            continue;
        }
        $href = $tagAttributes[0]['href'];
        // Skip links without target and in-page anchors.
        if (!$href || $href[0] === '#') {
            continue;
        }
        $hyperLinksData[] = [
            'tag' => $tagData,
            'href' => $href,
            'localPath' => $this->createLocalPath($href)
        ];
    }
    return $hyperLinksData;
}
853 
/**
 * Returns the href of the first <base> tag that has a non-empty href.
 *
 * @param string $html HTML content
 * @return string The href value; empty if no <base> tag was found
 */
public function extractBaseHref($html)
{
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $href = '';
    foreach ($htmlParser->splitTags('base', $html) as $index => $tagData) {
        // Only the odd entries of the split result are inspected as tags.
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'base') {
            continue;
        }
        $href = $tagAttributes[0]['href'];
        if ($href) {
            break;
        }
    }
    return $href;
}
879 
880  /******************************************
881  *
882  * Indexing; external URL
883  *
884  ******************************************/
/**
 * Indexes the document behind an external URL if it is served as
 * "text/html".
 *
 * The Content-Type header is looked up case-insensitively (HTTP header
 * names are not case-sensitive), a failed header request is handled, and
 * the temporary file is removed even if indexing throws.
 *
 * @param string $externalUrl URL, e.g. "http://example.org/index.html"
 */
public function indexExternalUrl($externalUrl)
{
    // Get headers:
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    $contentType = '';
    if (is_array($urlHeaders)) {
        foreach ($urlHeaders as $headerName => $headerValue) {
            if (strcasecmp(trim((string)$headerName), 'Content-Type') === 0) {
                $contentType = (string)$headerValue;
            }
        }
    }
    if (stripos($contentType, 'text/html') === false) {
        return;
    }

    $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
    if ((string)$content === '') {
        return;
    }

    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    if (!$tmpFile) {
        return;
    }
    try {
        GeneralUtility::writeFile($tmpFile, $content);
        // Index that file. TRUE as second parameter forces indexing, since
        // an mtime check makes no sense for a freshly written temp file.
        $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
    } finally {
        if (is_file($tmpFile)) {
            unlink($tmpFile);
        }
    }
}
910 
/**
 * Fetches the HTTP headers of a URL.
 *
 * Each header line is split at the first colon. Lines without a colon
 * (such as the HTTP status line) are stored with a NULL value, as before,
 * but no longer raise an undefined-offset notice. Header names and values
 * are returned as received (values are not trimmed).
 *
 * @param string $url URL to request
 * @return array|null Header name => value, or NULL if nothing was returned
 */
public function getUrlHeaders($url)
{
    // Try to get the headers only
    $content = GeneralUtility::getUrl($url, 2);
    if ((string)$content === '') {
        return null;
    }

    // Compile headers:
    $retVal = [];
    foreach (GeneralUtility::trimExplode(LF, $content, true) as $line) {
        if (trim($line) === '') {
            break;
        }
        // Pad so that lines without ":" yield a NULL value instead of a notice.
        list($headKey, $headValue) = array_pad(explode(':', $line, 2), 2, null);
        $retVal[$headKey] = $headValue;
    }
    return $retVal;
}
935 
/**
 * Resolves a link target to a local file path by trying a fixed list of
 * strategies in order; the first non-empty result wins.
 *
 * @param string $sourcePath Link target (href)
 * @return string Local path, or an empty value if no strategy matched
 */
protected function createLocalPath($sourcePath)
{
    $strategies = [
        'createLocalPathFromT3vars',
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
        'createLocalPathFromRelativeURL'
    ];
    $resolvedPath = '';
    foreach ($strategies as $strategy) {
        $resolvedPath = $this->{$strategy}($sourcePath);
        if ($resolvedPath != '') {
            return $resolvedPath;
        }
    }
    return $resolvedPath;
}
960 
/**
 * Looks up the local file registered for a link target in
 * $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'], which is
 * keyed by the short MD5 of the target.
 *
 * @param string $sourcePath Link target (href)
 * @return string Registered path if it is an existing file, otherwise empty
 */
protected function createLocalPathFromT3vars($sourcePath)
{
    $registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
    if (!is_array($registeredFiles)) {
        return '';
    }
    $lookupKey = GeneralUtility::shortMD5($sourcePath);
    // Note: not using self::isAllowedLocalFile here because this method
    // is allowed to index files outside of the web site (for example,
    // protected downloads)
    if (!isset($registeredFiles[$lookupKey]) || !is_file($registeredFiles[$lookupKey])) {
        return '';
    }
    return $registeredFiles[$lookupKey];
}
984 
/**
 * Resolves a link target that starts with the site URL to a path below
 * PATH_site.
 *
 * @param string $sourcePath Link target (href)
 * @return string Local path if it is an allowed existing file, otherwise empty
 */
protected function createLocalPathUsingDomainURL($sourcePath)
{
    $siteUrl = GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
    $siteUrlLength = strlen($siteUrl);
    if (substr($sourcePath, 0, $siteUrlLength) != $siteUrl) {
        return '';
    }
    $candidate = PATH_site . substr($sourcePath, $siteUrlLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1005 
/**
 * Resolves a link target that starts with the configured absRefPrefix to
 * a path below PATH_site. Only works when $GLOBALS['TSFE'] is a
 * TypoScriptFrontendController.
 *
 * @param string $sourcePath Link target (href)
 * @return string Local path if it is an allowed existing file, otherwise empty
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath)
{
    if (!$GLOBALS['TSFE'] instanceof \TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController) {
        return '';
    }
    $prefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
    $prefixLength = strlen($prefix);
    if ($prefixLength <= 0 || substr($sourcePath, 0, $prefixLength) != $prefix) {
        return '';
    }
    $candidate = PATH_site . substr($sourcePath, $prefixLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1029 
/**
 * Resolves a link target starting with "/" to a path below PATH_site.
 *
 * @param string $sourcePath Link target (href)
 * @return string Local path if it is an allowed existing file, otherwise empty
 */
protected function createLocalPathFromAbsoluteURL($sourcePath)
{
    if ($sourcePath[0] !== '/') {
        return '';
    }
    // Drop the leading slash; PATH_site is prepended instead.
    $candidate = PATH_site . substr($sourcePath, 1);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1049 
/**
 * Resolves a relative link target to a path below PATH_site.
 *
 * @param string $sourcePath Link target (href)
 * @return string Local path if it is an allowed existing file, otherwise empty
 */
protected function createLocalPathFromRelativeURL($sourcePath)
{
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }
    $candidate = PATH_site . $sourcePath;
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1067 
/**
 * Checks whether a URL is relative: it has no scheme and its path does
 * not start with "/".
 *
 * URLs without scheme or path component (the normal case for relative
 * links) no longer raise undefined-index notices. URLs that parse_url()
 * cannot parse are reported as relative, as before.
 *
 * @param string $url URL to check
 * @return bool TRUE if the URL is relative, otherwise FALSE
 */
protected static function isRelativeURL($url)
{
    $urlParts = @parse_url($url);
    if (!is_array($urlParts)) {
        return true;
    }
    $scheme = isset($urlParts['scheme']) ? $urlParts['scheme'] : '';
    $path = isset($urlParts['path']) ? $urlParts['path'] : '';
    return $scheme == '' && strpos($path, '/') !== 0;
}
1079 
/**
 * Checks that a path, after resolving back paths ("../"), lies below
 * PATH_site and points to an existing file.
 *
 * @param string $filePath Path to check
 * @return bool TRUE if the file exists inside the web site, otherwise FALSE
 */
protected static function isAllowedLocalFile($filePath)
{
    $resolvedPath = GeneralUtility::resolveBackPath($filePath);
    $pathPrefix = substr($resolvedPath, 0, strlen(PATH_site));
    $insideWebPath = $pathPrefix == PATH_site;
    $isExistingFile = is_file($resolvedPath);
    return $insideWebPath && $isExistingFile;
}
1093 
1094  /******************************************
1095  *
1096  * Indexing; external files (PDF, DOC, etc)
1097  *
1098  ******************************************/
1107  public function indexRegularDocument($file, $force = false, $contentTmpFile = '', $altExtension = '')
1108  {
1109  // Init
1110  $fI = pathinfo($file);
1111  $ext = $altExtension ?: strtolower($fI['extension']);
1112  // Create abs-path:
1113  if (!$contentTmpFile) {
1114  if (!GeneralUtility::isAbsPath($file)) {
1115  // Relative, prepend PATH_site:
1116  $absFile = GeneralUtility::getFileAbsFileName(PATH_site . $file);
1117  } else {
1118  // Absolute, pass-through:
1119  $absFile = $file;
1120  }
1121  $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
1122  } else {
1123  $absFile = $contentTmpFile;
1124  }
1125  // Indexing the document:
1126  if ($absFile && @is_file($absFile)) {
1127  if ($this->external_parsers[$ext]) {
1128  $fileInfo = stat($absFile);
1129  $cParts = $this->fileContentParts($ext, $absFile);
1130  foreach ($cParts as $cPKey) {
1131  $this->internal_log = [];
1132  $this->log_push('Index: ' . str_replace('.', '_', basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
1133  $Pstart = GeneralUtility::milliseconds();
1134  $subinfo = ['key' => $cPKey];
1135  // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
1136  $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
1137  $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
1138  if ($check > 0 || $force) {
1139  if ($check > 0) {
1140  $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
1141  } else {
1142  $this->log_setTSlogMessage('Indexing forced by flag', 1);
1143  }
1144  // Check external file counter:
1145  if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
1146  // Divide into title,keywords,description and body:
1147  $this->log_push('Split content', '');
1148  $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
1149  $this->log_pull();
1150  if (is_array($contentParts)) {
1151  // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
1153  if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
1154  // Increment counter:
1155  $this->externalFileCounter++;
1156  // Splitting words
1157  $this->log_push('Extract words from content', '');
1158  $splitInWords = $this->processWordsInArrays($contentParts);
1159  $this->log_pull();
1160  // Analyse the indexed words.
1161  $this->log_push('Analyse the extracted words', '');
1162  $indexArr = $this->indexAnalyze($splitInWords);
1163  $this->log_pull();
1164  // Submitting page (phash) record
1165  $this->log_push('Submitting page', '');
1166  // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
1167  $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
1168  $this->log_pull();
1169  // Check words and submit to word list if not there
1170  $this->log_push('Check word list and submit words', '');
1171  if (IndexedSearchUtility::isTableUsed('index_words')) {
1172  $this->checkWordList($indexArr);
1173  $this->submitWords($indexArr, $phash_arr['phash']);
1174  }
1175  $this->log_pull();
1176  // Set parsetime
1177  $this->updateParsetime($phash_arr['phash'], GeneralUtility::milliseconds() - $Pstart);
1178  } else {
1179  // Update the timestamp
1180  $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
1181  $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
1182  }
1183  } else {
1184  $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
1185  }
1186  } else {
1187  $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
1188  }
1189  } else {
1190  $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
1191  }
1192  // Checking and setting sections:
1193  $this->submitFile_section($phash_arr['phash']);
1194  // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
1195  $this->log_pull();
1196  }
1197  } else {
1198  $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
1199  }
1200  } else {
1201  $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
1202  }
1203  }
1204 
1214  public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
1215  {
1216  $contentArray = null;
1217  // Consult relevant external document parser:
1218  if (is_object($this->external_parsers[$fileExtension])) {
1219  $contentArray = $this->external_parsers[$fileExtension]->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
1220  }
1221  return $contentArray;
1222  }
1223 
1231  public function fileContentParts($ext, $absFile)
1232  {
1233  $cParts = [0];
1234  // Consult relevant external document parser:
1235  if (is_object($this->external_parsers[$ext])) {
1236  $cParts = $this->external_parsers[$ext]->fileContentParts($ext, $absFile);
1237  }
1238  return $cParts;
1239  }
1240 
1248  public function splitRegularContent($content)
1249  {
1250  $contentArr = $this->defaultContentArray;
1251  $contentArr['body'] = $content;
1252  return $contentArr;
1253  }
1254 
1255  /**********************************
1256  *
1257  * Analysing content, Extracting words
1258  *
1259  **********************************/
1266  public function charsetEntity2utf8(&$contentArr, $charset)
1267  {
1268  // Convert charset if necessary
1269  foreach ($contentArr as $key => $value) {
1270  if ((string)$contentArr[$key] !== '') {
1271  if ($charset !== 'utf-8') {
1272  $contentArr[$key] = mb_convert_encoding($contentArr[$key], 'utf-8', $charset);
1273  }
1274  // decode all numeric / html-entities in the string to real characters:
1275  $contentArr[$key] = html_entity_decode($contentArr[$key]);
1276  }
1277  }
1278  }
1279 
1286  public function processWordsInArrays($contentArr)
1287  {
1288  // split all parts to words
1289  foreach ($contentArr as $key => $value) {
1290  $contentArr[$key] = $this->lexerObj->split2Words($contentArr[$key]);
1291  }
1292  // For title, keywords, and description we don't want duplicates:
1293  $contentArr['title'] = array_unique($contentArr['title']);
1294  $contentArr['keywords'] = array_unique($contentArr['keywords']);
1295  $contentArr['description'] = array_unique($contentArr['description']);
1296  // Return modified array:
1297  return $contentArr;
1298  }
1299 
1306  public function bodyDescription($contentArr)
1307  {
1308  // Setting description
1309  $maxL = MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
1310  if ($maxL) {
1311  $bodyDescription = preg_replace('/\s+/u', ' ', $contentArr['body']);
1312  // Shorten the string:
1313  $bodyDescription = mb_strcut($bodyDescription, 0, $maxL, 'utf-8');
1314  }
1315  return $bodyDescription;
1316  }
1317 
1324  public function indexAnalyze($content)
1325  {
1326  $indexArr = [];
1327  $this->analyzeHeaderinfo($indexArr, $content, 'title', 7);
1328  $this->analyzeHeaderinfo($indexArr, $content, 'keywords', 6);
1329  $this->analyzeHeaderinfo($indexArr, $content, 'description', 5);
1330  $this->analyzeBody($indexArr, $content);
1331  return $indexArr;
1332  }
1333 
1342  public function analyzeHeaderinfo(&$retArr, $content, $key, $offset)
1343  {
1344  foreach ($content[$key] as $val) {
1345  $val = substr($val, 0, 60);
1346  // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
1347  if (!isset($retArr[$val])) {
1348  // Word ID (wid)
1349  $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
1350  // Metaphone value is also 60 only chars long
1351  $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
1352  $retArr[$val]['metaphone'] = $metaphone;
1353  }
1354  // Build metaphone fulltext string (can be used for fulltext indexing)
1355  if ($this->storeMetaphoneInfoAsWords) {
1356  $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
1357  }
1358  // Priority used for flagBitMask feature (see extension configuration)
1359  $retArr[$val]['cmp'] = $retArr[$val]['cmp'] | pow(2, $offset);
1360  // Increase number of occurrences
1361  $retArr[$val]['count']++;
1362  $this->wordcount++;
1363  }
1364  }
1365 
1372  public function analyzeBody(&$retArr, $content)
1373  {
1374  foreach ($content['body'] as $key => $val) {
1375  $val = substr($val, 0, 60);
1376  // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
1377  if (!isset($retArr[$val])) {
1378  // First occurrence (used for ranking results)
1379  $retArr[$val]['first'] = $key;
1380  // Word ID (wid)
1381  $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
1382  // Metaphone value is also only 60 chars long
1383  $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
1384  $retArr[$val]['metaphone'] = $metaphone;
1385  }
1386  // Build metaphone fulltext string (can be used for fulltext indexing)
1387  if ($this->storeMetaphoneInfoAsWords) {
1388  $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
1389  }
1390  // Increase number of occurrences
1391  $retArr[$val]['count']++;
1392  $this->wordcount++;
1393  }
1394  }
1395 
1403  public function metaphone($word, $returnRawMetaphoneValue = false)
1404  {
1405  if (is_object($this->metaphoneObj)) {
1406  $metaphoneRawValue = $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid']);
1407  } else {
1408  // Use native PHP function instead of advanced doubleMetaphone class
1409  $metaphoneRawValue = metaphone($word);
1410  }
1411  if ($returnRawMetaphoneValue) {
1412  $result = $metaphoneRawValue;
1413  } elseif ($metaphoneRawValue !== '') {
1414  // Create hash and return integer
1415  $result = IndexedSearchUtility::md5inthash($metaphoneRawValue);
1416  } else {
1417  $result = 0;
1418  }
1419  return $result;
1420  }
1421 
1422  /********************************
1423  *
1424  * SQL; TYPO3 Pages
1425  *
1426  *******************************/
1430  public function submitPage()
1431  {
1432  // Remove any current data for this phash:
1433  $this->removeOldIndexedPages($this->hash['phash']);
1434  // setting new phash_row
1435  $fields = [
1436  'phash' => $this->hash['phash'],
1437  'phash_grouping' => $this->hash['phash_grouping'],
1438  'cHashParams' => serialize($this->cHashParams),
1439  'contentHash' => $this->content_md5h,
1440  'data_page_id' => $this->conf['id'],
1441  'data_page_reg1' => $this->conf['page_cache_reg1'],
1442  'data_page_type' => $this->conf['type'],
1443  'data_page_mp' => $this->conf['MP'],
1444  'gr_list' => $this->conf['gr_list'],
1445  'item_type' => 0,
1446  // TYPO3 page
1447  'item_title' => $this->contentParts['title'],
1448  'item_description' => $this->bodyDescription($this->contentParts),
1449  'item_mtime' => (int)$this->conf['mtime'],
1450  'item_size' => strlen($this->conf['content']),
1451  'tstamp' => $GLOBALS['EXEC_TIME'],
1452  'crdate' => $GLOBALS['EXEC_TIME'],
1453  'item_crdate' => $this->conf['crdate'],
1454  // Creation date of page
1455  'sys_language_uid' => $this->conf['sys_language_uid'],
1456  // Sys language uid of the page. Should reflect which language it DOES actually display!
1457  'externalUrl' => 0,
1458  'recordUid' => (int)$this->conf['recordUid'],
1459  'freeIndexUid' => (int)$this->conf['freeIndexUid'],
1460  'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
1461  ];
1462  if (IndexedSearchUtility::isTableUsed('index_phash')) {
1463  $connection = GeneralUtility::makeInstance(ConnectionPool::class)
1464  ->getConnectionForTable('index_phash');
1465  $connection->insert(
1466  'index_phash',
1467  $fields,
1468  ['cHashParams' => Connection::PARAM_LOB]
1469  );
1470  }
1471  // PROCESSING index_section
1472  $this->submit_section($this->hash['phash'], $this->hash['phash']);
1473  // PROCESSING index_grlist
1474  $this->submit_grlist($this->hash['phash'], $this->hash['phash']);
1475  // PROCESSING index_fulltext
1476  $fields = [
1477  'phash' => $this->hash['phash'],
1478  'fulltextdata' => implode(' ', $this->contentParts),
1479  'metaphonedata' => $this->metaphoneContent
1480  ];
1481  if ($this->indexerConfig['fullTextDataLength'] > 0) {
1482  $fields['fulltextdata'] = substr($fields['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
1483  }
1484  if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
1485  $connection = GeneralUtility::makeInstance(ConnectionPool::class)
1486  ->getConnectionForTable('index_fulltext');
1487  $connection->insert('index_fulltext', $fields);
1488  }
1489  // PROCESSING index_debug
1490  if ($this->indexerConfig['debugMode']) {
1491  $fields = [
1492  'phash' => $this->hash['phash'],
1493  'debuginfo' => serialize([
1494  'cHashParams' => $this->cHashParams,
1495  'external_parsers initialized' => array_keys($this->external_parsers),
1496  'conf' => array_merge($this->conf, ['content' => substr($this->conf['content'], 0, 1000)]),
1497  'contentParts' => array_merge($this->contentParts, ['body' => substr($this->contentParts['body'], 0, 1000)]),
1498  'logs' => $this->internal_log,
1499  'lexer' => $this->lexerObj->debugString
1500  ])
1501  ];
1502  if (IndexedSearchUtility::isTableUsed('index_debug')) {
1503  $connection = GeneralUtility::makeInstance(ConnectionPool::class)
1504  ->getConnectionForTable('index_debug');
1505  $connection->insert('index_debug', $fields);
1506  }
1507  }
1508  }
1509 
1517  public function submit_grlist($hash, $phash_x)
1518  {
1519  // Setting the gr_list record
1520  $fields = [
1521  'phash' => $hash,
1522  'phash_x' => $phash_x,
1523  'hash_gr_list' => IndexedSearchUtility::md5inthash($this->conf['gr_list']),
1524  'gr_list' => $this->conf['gr_list']
1525  ];
1526  if (IndexedSearchUtility::isTableUsed('index_grlist')) {
1527  $connection = GeneralUtility::makeInstance(ConnectionPool::class)
1528  ->getConnectionForTable('index_grlist');
1529  $connection->insert('index_grlist', $fields);
1530  }
1531  }
1532 
1540  public function submit_section($hash, $hash_t3)
1541  {
1542  $fields = [
1543  'phash' => $hash,
1544  'phash_t3' => $hash_t3,
1545  'page_id' => (int)$this->conf['id']
1546  ];
1547  $this->getRootLineFields($fields);
1548  if (IndexedSearchUtility::isTableUsed('index_section')) {
1549  $connection = GeneralUtility::makeInstance(ConnectionPool::class)
1550  ->getConnectionForTable('index_section');
1551  $connection->insert('index_section', $fields);
1552  }
1553  }
1554 
1560  public function removeOldIndexedPages($phash)
1561  {
1562  // Removing old registrations for all tables. Because the pages are TYPO3 pages
1563  // there can be nothing else than 1-1 relations here.
1564  $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
1565  $tableArray = ['index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug'];
1566  foreach ($tableArray as $table) {
1567  if (IndexedSearchUtility::isTableUsed($table)) {
1568  $connectionPool->getConnectionForTable($table)->delete($table, ['phash' => (int)$phash]);
1569  }
1570  }
1571 
1572  // Removing all index_section records with hash_t3 set to this hash (this includes such
1573  // records set for external media on the page as well!). The re-insert of these records
1574  // are done in indexRegularDocument($file).
1575  if (IndexedSearchUtility::isTableUsed('index_section')) {
1576  $connectionPool->getConnectionForTable('index_section')
1577  ->delete('index_section', ['phash_t3' => (int)$phash]);
1578  }
1579  }
1580 
1581  /********************************
1582  *
1583  * SQL; External media
1584  *
1585  *******************************/
1599  public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
1600  {
1601  // Find item Type:
1602  $storeItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
1603  $storeItemType = $storeItemType ?: $ext;
1604  // Remove any current data for this phash:
1605  $this->removeOldIndexedFiles($hash['phash']);
1606  // Split filename:
1607  $fileParts = parse_url($file);
1608  // Setting new
1609  $fields = [
1610  'phash' => $hash['phash'],
1611  'phash_grouping' => $hash['phash_grouping'],
1612  'cHashParams' => serialize($subinfo),
1613  'contentHash' => $content_md5h,
1614  'data_filename' => $file,
1615  'item_type' => $storeItemType,
1616  'item_title' => trim($contentParts['title']) ?: basename($file),
1617  'item_description' => $this->bodyDescription($contentParts),
1618  'item_mtime' => $mtime,
1619  'item_size' => $size,
1620  'item_crdate' => $ctime,
1621  'tstamp' => $GLOBALS['EXEC_TIME'],
1622  'crdate' => $GLOBALS['EXEC_TIME'],
1623  'gr_list' => $this->conf['gr_list'],
1624  'externalUrl' => $fileParts['scheme'] ? 1 : 0,
1625  'recordUid' => (int)$this->conf['recordUid'],
1626  'freeIndexUid' => (int)$this->conf['freeIndexUid'],
1627  'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
1628  'sys_language_uid' => (int)$this->conf['sys_language_uid']
1629  ];
1630  if (IndexedSearchUtility::isTableUsed('index_phash')) {
1631  $connection = GeneralUtility::makeInstance(ConnectionPool::class)
1632  ->getConnectionForTable('index_phash');
1633  $connection->insert(
1634  'index_phash',
1635  $fields,
1636  ['cHashParams' => Connection::PARAM_LOB]
1637  );
1638  }
1639  // PROCESSING index_fulltext
1640  $fields = [
1641  'phash' => $hash['phash'],
1642  'fulltextdata' => implode(' ', $contentParts),
1643  'metaphonedata' => $this->metaphoneContent
1644  ];
1645  if ($this->indexerConfig['fullTextDataLength'] > 0) {
1646  $fields['fulltextdata'] = substr($fields['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
1647  }
1648  if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
1649  $connection = GeneralUtility::makeInstance(ConnectionPool::class)
1650  ->getConnectionForTable('index_fulltext');
1651  $connection->insert('index_fulltext', $fields);
1652  }
1653  // PROCESSING index_debug
1654  if ($this->indexerConfig['debugMode']) {
1655  $fields = [
1656  'phash' => $hash['phash'],
1657  'debuginfo' => serialize([
1658  'cHashParams' => $subinfo,
1659  'contentParts' => array_merge($contentParts, ['body' => substr($contentParts['body'], 0, 1000)]),
1660  'logs' => $this->internal_log,
1661  'lexer' => $this->lexerObj->debugString
1662  ])
1663  ];
1664  if (IndexedSearchUtility::isTableUsed('index_debug')) {
1665  $connection = GeneralUtility::makeInstance(ConnectionPool::class)
1666  ->getConnectionForTable('index_debug');
1667  $connection->insert('index_debug', $fields);
1668  }
1669  }
1670  }
1671 
1677  public function submitFile_grlist($hash)
1678  {
1679  // Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
1680  if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
1681  return;
1682  }
1683 
1684  $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
1685  ->getQueryBuilderForTable('index_grlist');
1686  $count = (int)$queryBuilder->count('*')
1687  ->from('index_grlist')
1688  ->where(
1689  $queryBuilder->expr()->eq(
1690  'phash',
1691  $queryBuilder->createNamedParameter($hash, \PDO::PARAM_INT)
1692  ),
1693  $queryBuilder->expr()->orX(
1694  $queryBuilder->expr()->eq(
1695  'hash_gr_list',
1696  $queryBuilder->createNamedParameter(
1697  IndexedSearchUtility::md5inthash($this->defaultGrList),
1698  \PDO::PARAM_INT
1699  )
1700  ),
1701  $queryBuilder->expr()->eq(
1702  'hash_gr_list',
1703  $queryBuilder->createNamedParameter(
1704  IndexedSearchUtility::md5inthash($this->conf['gr_list']),
1705  \PDO::PARAM_INT
1706  )
1707  )
1708  )
1709  )
1710  ->execute()
1711  ->fetchColumn();
1712 
1713  if ($count === 0) {
1714  $this->submit_grlist($hash, $hash);
1715  }
1716  }
1717 
1723  public function submitFile_section($hash)
1724  {
1725  // Testing if there is already a section
1726  if (!IndexedSearchUtility::isTableUsed('index_section')) {
1727  return;
1728  }
1729 
1730  $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
1731  ->getQueryBuilderForTable('index_section');
1732  $count = (int)$queryBuilder->count('phash')
1733  ->from('index_section')
1734  ->where(
1735  $queryBuilder->expr()->eq(
1736  'phash',
1737  $queryBuilder->createNamedParameter($hash, \PDO::PARAM_INT)
1738  ),
1739  $queryBuilder->expr()->eq(
1740  'page_id',
1741  $queryBuilder->createNamedParameter($this->conf['id'], \PDO::PARAM_INT)
1742  )
1743  )
1744  ->execute()
1745  ->fetchColumn();
1746 
1747  if ($count === 0) {
1748  $this->submit_section($hash, $this->hash['phash']);
1749  }
1750  }
1751 
1757  public function removeOldIndexedFiles($phash)
1758  {
1759  $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
1760  // Removing old registrations for tables.
1761  $tableArray = ['index_phash', 'index_grlist', 'index_fulltext', 'index_debug'];
1762  foreach ($tableArray as $table) {
1763  if (!IndexedSearchUtility::isTableUsed($table)) {
1764  continue;
1765  }
1766  $connectionPool->getConnectionForTable($table)->delete($table, ['phash' => (int)$phash]);
1767  }
1768  }
1769 
1770  /********************************
1771  *
1772  * SQL Helper functions
1773  *
1774  *******************************/
1783  public function checkMtimeTstamp($mtime, $phash)
1784  {
1785  if (!IndexedSearchUtility::isTableUsed('index_phash')) {
1786  // Not indexed (not in index_phash)
1787  $result = 4;
1788  } else {
1789  $row = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_phash')
1790  ->select(
1791  ['item_mtime', 'tstamp'],
1792  'index_phash',
1793  ['phash' => (int)$phash],
1794  [],
1795  [],
1796  1
1797  )
1798  ->fetch();
1799  // If there was an indexing of the page...:
1800  if (!empty($row)) {
1801  if ($this->tstamp_maxAge && $row['tstamp'] + $this->tstamp_maxAge < $GLOBALS['EXEC_TIME']) {
1802  // If max age is exceeded, index the page
1803  // The configured max-age was exceeded for the document and thus it's indexed.
1804  $result = 1;
1805  } else {
1806  if (!$this->tstamp_minAge || $row['tstamp'] + $this->tstamp_minAge < $GLOBALS['EXEC_TIME']) {
1807  // if minAge is not set or if minAge is exceeded, consider at mtime
1808  if ($mtime) {
1809  // It mtime is set, then it's tested. If not, the page must clearly be indexed.
1810  if ($row['item_mtime'] != $mtime) {
1811  // And if mtime is different from the index_phash mtime, it's about time to re-index.
1812  // The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.
1813  $result = 2;
1814  } else {
1815  // mtime matched the document, so no changes detected and no content updated
1816  $result = -1;
1817  if ($this->tstamp_maxAge) {
1818  $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($row['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . ' seconds to expire time).', 1);
1819  } else {
1820  $this->updateTstamp($phash);
1821  $this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
1822  }
1823  }
1824  } else {
1825  // The minimum age was exceed, but mtime was not set, so the page was indexed.
1826  $result = 3;
1827  }
1828  } else {
1829  // The minimum age was not exceeded
1830  $result = -2;
1831  }
1832  }
1833  } else {
1834  // Page has never been indexed (is not represented in the index_phash table).
1835  $result = 4;
1836  }
1837  }
1838  return $result;
1839  }
1840 
1846  public function checkContentHash()
1847  {
1848  // With this query the page will only be indexed if it's content is different from the same "phash_grouping" -page.
1849  $result = true;
1850  if (IndexedSearchUtility::isTableUsed('index_phash')) {
1851  $row = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_phash')
1852  ->select(
1853  ['phash'],
1854  'index_phash',
1855  [
1856  'phash_grouping' => (int)$this->hash['phash_grouping'],
1857  'contentHash' => (int)$this->content_md5h
1858  ],
1859  [],
1860  [],
1861  1
1862  )
1863  ->fetch();
1864 
1865  if (!empty($row)) {
1866  $result = $row;
1867  }
1868  }
1869  return $result;
1870  }
1871 
1880  public function checkExternalDocContentHash($hashGr, $content_md5h)
1881  {
1882  $result = true;
1883  if (IndexedSearchUtility::isTableUsed('index_phash')) {
1884  $count = (int)GeneralUtility::makeInstance(ConnectionPool::class)
1885  ->getConnectionForTable('index_phash')
1886  ->count(
1887  '*',
1888  'index_phash',
1889  [
1890  'phash_grouping' => (int)$hashGr,
1891  'contentHash' => (int)$content_md5h
1892  ]
1893  );
1894 
1895  $result = $count === 0;
1896  }
1897  return $result;
1898  }
1899 
1906  public function is_grlist_set($phash_x)
1907  {
1908  $result = false;
1909  if (IndexedSearchUtility::isTableUsed('index_grlist')) {
1910  $count = (int)GeneralUtility::makeInstance(ConnectionPool::class)
1911  ->getConnectionForTable('index_grlist')
1912  ->count(
1913  'phash_x',
1914  'index_grlist',
1915  ['phash_x' => (int)$phash_x]
1916  );
1917 
1918  $result = $count > 0;
1919  }
1920  return $result;
1921  }
1922 
1930  public function update_grlist($phash, $phash_x)
1931  {
1932  if (IndexedSearchUtility::isTableUsed('index_grlist')) {
1933  $count = (int)GeneralUtility::makeInstance(ConnectionPool::class)
1934  ->getConnectionForTable('index_grlist')
1935  ->count(
1936  'phash',
1937  'index_grlist',
1938  [
1939  'phash' => (int)$phash,
1940  'hash_gr_list' => IndexedSearchUtility::md5inthash($this->conf['gr_list'])
1941  ]
1942  );
1943 
1944  if ($count === 0) {
1945  $this->submit_grlist($phash, $phash_x);
1946  $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
1947  }
1948  }
1949  }
1950 
1957  public function updateTstamp($phash, $mtime = 0)
1958  {
1959  if (!IndexedSearchUtility::isTableUsed('index_phash')) {
1960  return;
1961  }
1962 
1963  $updateFields = [
1964  'tstamp' => $GLOBALS['EXEC_TIME']
1965  ];
1966 
1967  if ($mtime) {
1968  $updateFields['item_mtime'] = (int)$mtime;
1969  }
1970 
1971  GeneralUtility::makeInstance(ConnectionPool::class)
1972  ->getConnectionForTable('index_phash')
1973  ->update(
1974  'index_phash',
1975  $updateFields,
1976  [
1977  'phash' => (int)$phash
1978  ]
1979  );
1980  }
1981 
1987  public function updateSetId($phash)
1988  {
1989  if (!IndexedSearchUtility::isTableUsed('index_phash')) {
1990  return;
1991  }
1992 
1993  GeneralUtility::makeInstance(ConnectionPool::class)
1994  ->getConnectionForTable('index_phash')
1995  ->update(
1996  'index_phash',
1997  [
1998  'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
1999  ],
2000  [
2001  'phash' => (int)$phash
2002  ]
2003  );
2004  }
2005 
2012  public function updateParsetime($phash, $parsetime)
2013  {
2014  if (!IndexedSearchUtility::isTableUsed('index_phash')) {
2015  return;
2016  }
2017 
2018  GeneralUtility::makeInstance(ConnectionPool::class)
2019  ->getConnectionForTable('index_phash')
2020  ->update(
2021  'index_phash',
2022  [
2023  'parsetime' => (int)$parsetime
2024  ],
2025  [
2026  'phash' => (int)$phash
2027  ]
2028  );
2029  }
2030 
2034  public function updateRootline()
2035  {
2036  if (!IndexedSearchUtility::isTableUsed('index_section')) {
2037  return;
2038  }
2039 
2040  $updateFields = [];
2041  $this->getRootLineFields($updateFields);
2042 
2043  GeneralUtility::makeInstance(ConnectionPool::class)
2044  ->getConnectionForTable('index_section')
2045  ->update(
2046  'index_section',
2047  $updateFields,
2048  [
2049  'page_id' => (int)$this->conf['id']
2050  ]
2051  );
2052  }
2053 
2060  public function getRootLineFields(array &$fieldArray)
2061  {
2062  $fieldArray['rl0'] = (int)$this->conf['rootline_uids'][0];
2063  $fieldArray['rl1'] = (int)$this->conf['rootline_uids'][1];
2064  $fieldArray['rl2'] = (int)$this->conf['rootline_uids'][2];
2065  if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'])) {
2066  foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] as $fieldName => $rootLineLevel) {
2067  $fieldArray[$fieldName] = (int)$this->conf['rootline_uids'][$rootLineLevel];
2068  }
2069  }
2070  }
2071 
2077  public function includeCrawlerClass()
2078  {
2080  require_once \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('crawler') . 'class.tx_crawler_lib.php';
2081  }
2082 
2083  /********************************
2084  *
2085  * SQL; Submitting words
2086  *
2087  *******************************/
2093  public function checkWordList($wordListArray)
2094  {
2095  if (!IndexedSearchUtility::isTableUsed('index_words') || empty($wordListArray)) {
2096  return;
2097  }
2098 
2099  $wordListArrayCount = count($wordListArray);
2100  $phashArray = array_map('intval', array_column($wordListArray, 'hash'));
2101 
2102  $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('index_words');
2103  $count = (int)$queryBuilder->count('baseword')
2104  ->from('index_words')
2105  ->where(
2106  $queryBuilder->expr()->in(
2107  'wid',
2108  $queryBuilder->createNamedParameter($phashArray, Connection::PARAM_INT_ARRAY)
2109  )
2110  )
2111  ->execute()
2112  ->fetchColumn();
2113 
2114  if ($count !== $wordListArrayCount) {
2115  $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_words');
2116  $queryBuilder = $connection->createQueryBuilder();
2117 
2118  $result = $queryBuilder->select('baseword')
2119  ->from('index_words')
2120  ->where(
2121  $queryBuilder->expr()->in(
2122  'wid',
2123  $queryBuilder->createNamedParameter($phashArray, Connection::PARAM_INT_ARRAY)
2124  )
2125  )
2126  ->execute();
2127 
2128  $this->log_setTSlogMessage('Inserting words: ' . ($wordListArrayCount - $count), 1);
2129  while ($row = $result->fetch()) {
2130  unset($wordListArray[$row['baseword']]);
2131  }
2132 
2133  foreach ($wordListArray as $key => $val) {
2134  // A duplicate-key error will occur here if a word is NOT unset in the unset() line. However as
2135  // long as the words in $wl are NOT longer as 60 chars (the baseword varchar is 60 characters...)
2136  // this is not a problem.
2137  $connection->insert(
2138  'index_words',
2139  [
2140  'wid' => $val['hash'],
2141  'baseword' => $key,
2142  'metaphone' => $val['metaphone']
2143  ]
2144  );
2145  }
2146  }
2147  }
2148 
2155  public function submitWords($wordList, $phash)
2156  {
2157  if (!IndexedSearchUtility::isTableUsed('index_rel')) {
2158  return;
2159  }
2160  $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
2161  $queryBuilder = $connectionPool->getQueryBuilderForTable('index_words');
2162  $result = $queryBuilder->select('wid')
2163  ->from('index_words')
2164  ->where(
2165  $queryBuilder->expr()->neq('is_stopword', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT))
2166  )
2167  ->groupBy('wid')
2168  ->execute();
2169 
2170  $stopWords = [];
2171  while ($row = $result->fetch()) {
2172  $stopWords[$row['wid']] = $row;
2173  }
2174 
2175  $connectionPool->getConnectionForTable('index_rel')->delete('index_rel', ['phash' => (int)$phash]);
2176 
2177  $fields = ['phash', 'wid', 'count', 'first', 'freq', 'flags'];
2178  $rows = [];
2179  foreach ($wordList as $val) {
2180  if (isset($stopWords[$val['hash']])) {
2181  continue;
2182  }
2183  $rows[] = [
2184  (int)$phash,
2185  (int)$val['hash'],
2186  (int)$val['count'],
2187  (int)$val['first'],
2188  $this->freqMap($val['count'] / $this->wordcount),
2189  $val['cmp'] & $this->flagBitMask
2190  ];
2191  }
2192 
2193  if (!empty($rows)) {
2194  $connectionPool->getConnectionForTable('index_rel')->bulkInsert('index_rel', $rows, $fields);
2195  }
2196  }
2197 
2205  public function freqMap($freq)
2206  {
2207  $mapFactor = $this->freqMax * 100 * $this->freqRange;
2208  if ($freq <= 1) {
2209  $newFreq = $freq * $mapFactor;
2210  $newFreq = $newFreq > $this->freqRange ? $this->freqRange : $newFreq;
2211  } else {
2212  $newFreq = $freq / $mapFactor;
2213  }
2214  return $newFreq;
2215  }
2216 
2217  /********************************
2218  *
2219  * Hashing
2220  *
2221  *******************************/
2225  public function setT3Hashes()
2226  {
2227  // Set main array:
2228  $hArray = [
2229  'id' => (int)$this->conf['id'],
2230  'type' => (int)$this->conf['type'],
2231  'sys_lang' => (int)$this->conf['sys_language_uid'],
2232  'MP' => (string)$this->conf['MP'],
2233  'cHash' => $this->cHashParams
2234  ];
2235  // Set grouping hash (Identifies a "page" combined of id, type, language, mountpoint and cHash parameters):
2236  $this->hash['phash_grouping'] = IndexedSearchUtility::md5inthash(serialize($hArray));
2237  // Add gr_list and set plain phash (Subdivision where special page composition based on login is taken into account as well. It is expected that such pages are normally similar regardless of the login.)
2238  $hArray['gr_list'] = (string)$this->conf['gr_list'];
2239  $this->hash['phash'] = IndexedSearchUtility::md5inthash(serialize($hArray));
2240  }
2241 
2249  public function setExtHashes($file, $subinfo = [])
2250  {
2251  // Set main array:
2252  $hash = [];
2253  $hArray = [
2254  'file' => $file
2255  ];
2256  // Set grouping hash:
2257  $hash['phash_grouping'] = IndexedSearchUtility::md5inthash(serialize($hArray));
2258  // Add subinfo
2259  $hArray['subinfo'] = $subinfo;
2260  $hash['phash'] = IndexedSearchUtility::md5inthash(serialize($hArray));
2261  return $hash;
2262  }
2263 
2264  /*********************************
2265  *
2266  * Internal logging functions
2267  *
2268  *********************************/
2275  public function log_push($msg, $key)
2276  {
2277  $this->timeTracker->push($msg, $key);
2278  }
2279 
2283  public function log_pull()
2284  {
2285  $this->timeTracker->pull();
2286  }
2287 
2294  public function log_setTSlogMessage($msg, $errorNum = 0)
2295  {
2296  $this->timeTracker->setTSlogMessage($msg, $errorNum);
2297  $this->internal_log[] = $msg;
2298  }
2299 
2308  protected function addSpacesToKeywordList($keywordList)
2309  {
2310  $keywords = GeneralUtility::trimExplode(',', $keywordList);
2311  return ' ' . implode(', ', $keywords) . ' ';
2312  }
2313 }
analyzeHeaderinfo(&$retArr, $content, $key, $offset)
Definition: Indexer.php:1342
submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
Definition: Indexer.php:1599
submit_grlist($hash, $phash_x)
Definition: Indexer.php:1517
setExtHashes($file, $subinfo=[])
Definition: Indexer.php:2249
indexRegularDocument($file, $force=false, $contentTmpFile='', $altExtension='')
Definition: Indexer.php:1107
backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=[], $createCHash=false)
Definition: Indexer.php:352
backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0)
Definition: Indexer.php:419
static isAllowedLocalFile($filePath)
Definition: Indexer.php:1086
checkExternalDocContentHash($hashGr, $content_md5h)
Definition: Indexer.php:1880
convertHTMLToUtf8($content, $charset='')
Definition: Indexer.php:673
static forceIntegerInRange($theInt, $min, $max=2000000000, $defaultValue=0)
Definition: MathUtility.php:31
update_grlist($phash, $phash_x)
Definition: Indexer.php:1930
embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
Definition: Indexer.php:698
createLocalPathFromRelativeURL($sourcePath)
Definition: Indexer.php:1056
static getFileAbsFileName($filename, $_=null, $_2=null)
createLocalPathFromT3vars($sourcePath)
Definition: Indexer.php:969
updateTstamp($phash, $mtime=0)
Definition: Indexer.php:1957
getRootLineFields(array &$fieldArray)
Definition: Indexer.php:2060
static trimExplode($delim, $string, $removeEmptyValues=false, $limit=0)
metaphone($word, $returnRawMetaphoneValue=false)
Definition: Indexer.php:1403
addSpacesToKeywordList($keywordList)
Definition: Indexer.php:2308
submitWords($wordList, $phash)
Definition: Indexer.php:2155
static makeInstance($className,... $constructorArguments)
fileContentParts($ext, $absFile)
Definition: Indexer.php:1231
$fields
Definition: pages.php:4
updateParsetime($phash, $parsetime)
Definition: Indexer.php:2012
readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
Definition: Indexer.php:1214
static implodeArrayForUrl($name, array $theArray, $str='', $skipBlank=false, $rawurlencodeParamName=false)
static tempnam($filePrefix, $fileSuffix='')
analyzeBody(&$retArr, $content)
Definition: Indexer.php:1372
createLocalPathFromAbsoluteURL($sourcePath)
Definition: Indexer.php:1037
createLocalPathUsingDomainURL($sourcePath)
Definition: Indexer.php:991
charsetEntity2utf8(&$contentArr, $charset)
Definition: Indexer.php:1266
backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0)
Definition: Indexer.php:401
if(TYPO3_MODE==='BE') $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_tsfebeuserauth.php']['frontendEditingController']['default']
static writeFile($file, $content, $changePermissions=false)
checkMtimeTstamp($mtime, $phash)
Definition: Indexer.php:1783
submit_section($hash, $hash_t3)
Definition: Indexer.php:1540
log_setTSlogMessage($msg, $errorNum=0)
Definition: Indexer.php:2294
indexExternalUrl($externalUrl)
Definition: Indexer.php:891
checkWordList($wordListArray)
Definition: Indexer.php:2093
createLocalPathUsingAbsRefPrefix($sourcePath)
Definition: Indexer.php:1013