TYPO3CMS  8
 All Classes Namespaces Files Functions Variables Pages
indexed_search/Classes/Indexer.php
Go to the documentation of this file.
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3 
4 /*
5  * This file is part of the TYPO3 CMS project.
6  *
7  * It is free software; you can redistribute it and/or modify it under
8  * the terms of the GNU General Public License, either version 2
9  * of the License, or any later version.
10  *
11  * For the full copyright and license information, please read the
12  * LICENSE.txt file that was distributed with this source code.
13  *
14  * The TYPO3 project - inspiring people to share!
15  */
16 
24 
28 class Indexer
29 {
33  public $reasons = [
34  -1 => 'mtime matched the document, so no changes detected and no content updated',
35  -2 => 'The minimum age was not exceeded',
36  1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
37  2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
38  3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
39  4 => 'Page has never been indexed (is not represented in the index_phash table).'
40  ];
41 
47  public $excludeSections = 'script,style';
48 
54  public $external_parsers = [];
55 
63  public $defaultGrList = '0,-1';
64 
70  public $tstamp_maxAge = 0;
71 
78  public $tstamp_minAge = 0;
79 
85  public $maxExternalFiles = 0;
86 
92  public $forceIndexing = false;
93 
99  public $crawlerActive = false;
100 
107  'title' => '',
108  'description' => '',
109  'keywords' => '',
110  'body' => ''
111  ];
112 
116  public $wordcount = 0;
117 
122 
126  public $conf = [];
127 
133  public $indexerConfig = [];
134 
140  public $hash = [];
141 
147  public $file_phash_arr = [];
148 
154  public $contentParts = [];
155 
161  public $content_md5h = '';
162 
166  public $internal_log = [];
167 
174 
178  public $cHashParams = [];
179 
185  public $freqRange = 32000;
186 
190  public $freqMax = 0.1;
191 
195  public $enableMetaphoneSearch = false;
196 
201 
205  public $metaphoneContent = '';
206 
212  public $csObj;
213 
220 
226  public $lexerObj;
227 
231  public $flagBitMask;
232 
236  protected $timeTracker;
237 
241  public function __construct()
242  {
243  $this->timeTracker = GeneralUtility::makeInstance(TimeTracker::class);
244  }
245 
252  public function hook_indexContent(&$pObj)
253  {
254  // Indexer configuration from Extension Manager interface:
255  $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'], ['allowed_classes' => false]);
256  // Crawler activation:
257  // Requirements are that the crawler is loaded, a crawler session is running and re-indexing requested as processing instruction:
258  if (\TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler') && $pObj->applicationData['tx_crawler']['running'] && in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions'])) {
259  // Setting simple log message:
260  $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
261  // Setting variables:
262  $this->crawlerActive = true;
263  // Crawler active flag
264  $this->forceIndexing = true;
265  }
266  // Determine if page should be indexed, and if so, configure and initialize indexer
267  if ($pObj->config['config']['index_enable']) {
268  $this->log_push('Index page', '');
269  if (!$indexerConfig['disableFrontendIndexing'] || $this->crawlerActive) {
270  if (!$pObj->page['no_search']) {
271  if (!$pObj->no_cache) {
272  if ((int)$pObj->sys_language_uid === (int)$pObj->sys_language_content) {
273  // Setting up internal configuration from config array:
274  $this->conf = [];
275  // Information about page for which the indexing takes place
276  $this->conf['id'] = $pObj->id;
277  // Page id
278  $this->conf['type'] = $pObj->type;
279  // Page type
280  $this->conf['sys_language_uid'] = $pObj->sys_language_uid;
281  // sys_language UID of the language of the indexing.
282  $this->conf['MP'] = $pObj->MP;
283  // MP variable, if any (Mount Points)
284  $this->conf['gr_list'] = $pObj->gr_list;
285  // Group list
286  $this->conf['cHash'] = $pObj->cHash;
287  // cHash string for additional parameters
288  $this->conf['cHash_array'] = $pObj->cHash_array;
289  // Array of the additional parameters
290  $this->conf['crdate'] = $pObj->page['crdate'];
291  // The creation date of the TYPO3 page
292  $this->conf['page_cache_reg1'] = $pObj->page_cache_reg1;
293  // reg1 of the caching table. Not known what practical use this has.
294  // Root line uids
295  $this->conf['rootline_uids'] = [];
296  foreach ($pObj->config['rootLine'] as $rlkey => $rldat) {
297  $this->conf['rootline_uids'][$rlkey] = $rldat['uid'];
298  }
299  // Content of page:
300  $this->conf['content'] = $pObj->content;
301  // Content string (HTML of TYPO3 page)
302  $this->conf['indexedDocTitle'] = $pObj->convOutputCharset($pObj->indexedDocTitle);
303  // Alternative title for indexing
304  $this->conf['metaCharset'] = $pObj->metaCharset;
305  // Character set of content (will be converted to utf-8 during indexing)
306  $this->conf['mtime'] = isset($pObj->register['SYS_LASTCHANGED']) ? $pObj->register['SYS_LASTCHANGED'] : $pObj->page['SYS_LASTCHANGED'];
307  // Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.
308  // Configuration of behavior:
309  $this->conf['index_externals'] = $pObj->config['config']['index_externals'];
310  // Whether to index external documents like PDF, DOC etc. (if possible)
311  $this->conf['index_descrLgd'] = $pObj->config['config']['index_descrLgd'];
312  // Length of description text (max 250, default 200)
313  $this->conf['index_metatags'] = isset($pObj->config['config']['index_metatags']) ? $pObj->config['config']['index_metatags'] : true;
314  // Set to zero:
315  $this->conf['recordUid'] = 0;
316  $this->conf['freeIndexUid'] = 0;
317  $this->conf['freeIndexSetId'] = 0;
318  // Init and start indexing:
319  $this->init();
320  $this->indexTypo3PageContent();
321  } else {
322  $this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
323  }
324  } else {
325  $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
326  }
327  } else {
328  $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
329  }
330  } else {
331  $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
332  }
333  $this->log_pull();
334  }
335  }
336 
337  /****************************
338  *
339  * Backend API
340  *
341  ****************************/
354  public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array = [], $createCHash = false)
355  {
356  // Setting up internal configuration from config array:
357  $this->conf = [];
358  // Information about page for which the indexing takes place
359  $this->conf['id'] = $id;
360  // Page id (int)
361  $this->conf['type'] = $type;
362  // Page type (int)
363  $this->conf['sys_language_uid'] = $sys_language_uid;
364  // sys_language UID of the language of the indexing (int)
365  $this->conf['MP'] = $MP;
366  // MP variable, if any (Mount Points) (string)
367  $this->conf['gr_list'] = '0,-1';
368  // Group list (hardcoded for now...)
369  // cHash values:
370  if ($createCHash) {
371  /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
372  $cacheHash = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\CacheHashCalculator::class);
373  $this->conf['cHash'] = $cacheHash->generateForParameters(GeneralUtility::implodeArrayForUrl('', $cHash_array));
374  } else {
375  $this->conf['cHash'] = '';
376  }
377  // cHash string for additional parameters
378  $this->conf['cHash_array'] = $cHash_array;
379  // Array of the additional parameters
380  // Set to defaults
381  $this->conf['freeIndexUid'] = 0;
382  $this->conf['freeIndexSetId'] = 0;
383  $this->conf['page_cache_reg1'] = '';
384  // Root line uids
385  $this->conf['rootline_uids'] = $uidRL;
386  // Configuration of behavior:
387  $this->conf['index_externals'] = 1;
388  // Whether to index external documents like PDF, DOC etc. (if possible)
389  $this->conf['index_descrLgd'] = 200;
390  // Length of description text (max 250, default 200)
391  $this->conf['index_metatags'] = true;
392  // Whether to index document keywords and description (if present)
393  // Init and start indexing:
394  $this->init();
395  }
396 
404  public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0)
405  {
406  $this->conf['freeIndexUid'] = $freeIndexUid;
407  $this->conf['freeIndexSetId'] = $freeIndexSetId;
408  }
409 
423  public function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0)
424  {
425  // Content of page:
426  $this->conf['mtime'] = $mtime;
427  // Most recent modification time (seconds) of the content
428  $this->conf['crdate'] = $crdate;
429  // The creation date of the TYPO3 content
430  $this->conf['recordUid'] = $recordUid;
431  // UID of the record, if applicable
432  // Construct fake HTML for parsing:
433  $this->conf['content'] = '
434  <html>
435  <head>
436  <title>' . htmlspecialchars($title) . '</title>
437  <meta name="keywords" content="' . htmlspecialchars($keywords) . '" />
438  <meta name="description" content="' . htmlspecialchars($description) . '" />
439  </head>
440  <body>
441  ' . htmlspecialchars($content) . '
442  </body>
443  </html>';
444  // Content string (HTML of TYPO3 page)
445  // Initializing charset:
446  $this->conf['metaCharset'] = $charset;
447  // Character set of content (will be converted to utf-8 during indexing)
448  $this->conf['indexedDocTitle'] = '';
449  // Alternative title for indexing
450  // Index content as if it was a TYPO3 page:
451  $this->indexTypo3PageContent();
452  }
453 
454  /********************************
455  *
456  * Initialization
457  *
458  *******************************/
464  public function init()
465  {
466  // Initializing:
467  $this->cHashParams = $this->conf['cHash_array'];
468  if (is_array($this->cHashParams) && !empty($this->cHashParams)) {
469  if ($this->conf['cHash']) {
470  // Add this so that URL's come out right...
471  $this->cHashParams['cHash'] = $this->conf['cHash'];
472  }
473  unset($this->cHashParams['encryptionKey']);
474  }
475  // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
476  $this->setT3Hashes();
477  // Indexer configuration from Extension Manager interface:
478  $this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'], ['allowed_classes' => false]);
479  $this->tstamp_minAge = MathUtility::forceIntegerInRange($this->indexerConfig['minAge'] * 3600, 0);
480  $this->tstamp_maxAge = MathUtility::forceIntegerInRange($this->indexerConfig['maxAge'] * 3600, 0);
481  $this->maxExternalFiles = MathUtility::forceIntegerInRange($this->indexerConfig['maxExternalFiles'], 0, 1000, 5);
482  $this->flagBitMask = MathUtility::forceIntegerInRange($this->indexerConfig['flagBitMask'], 0, 255);
483  // Workaround: If the extension configuration was not updated yet, the value is not existing
484  $this->enableMetaphoneSearch = !isset($this->indexerConfig['enableMetaphoneSearch']) || $this->indexerConfig['enableMetaphoneSearch'];
485  $this->storeMetaphoneInfoAsWords = !IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;
486  // Initialize external document parsers:
487  // Example configuration, see ext_localconf.php of this file!
488  if ($this->conf['index_externals']) {
489  $this->initializeExternalParsers();
490  }
491  // Initialize lexer (class that deconstructs the text into words):
492  $lexerObjRef = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'] ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'] : 'TYPO3\\CMS\\IndexedSearch\\Lexer';
493  $this->lexerObj = GeneralUtility::getUserObj($lexerObjRef);
494  $this->lexerObj->debug = $this->indexerConfig['debugMode'];
495  // Initialize metaphone hook:
496  // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
497  if ($this->enableMetaphoneSearch && $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']) {
498  $this->metaphoneObj = GeneralUtility::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']);
499  $this->metaphoneObj->pObj = $this;
500  }
501  // Init charset class:
502  $this->csObj = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Charset\CharsetConverter::class);
503  }
504 
512  public function initializeExternalParsers()
513  {
514  if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'])) {
515  foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] as $extension => $_objRef) {
516  $this->external_parsers[$extension] = GeneralUtility::getUserObj($_objRef);
517  $this->external_parsers[$extension]->pObj = $this;
518  // Init parser and if it returns FALSE, unset its entry again:
519  if (!$this->external_parsers[$extension]->initParser($extension)) {
520  unset($this->external_parsers[$extension]);
521  }
522  }
523  }
524  }
525 
526  /********************************
527  *
528  * Indexing; TYPO3 pages (HTML content)
529  *
530  *******************************/
536  public function indexTypo3PageContent()
537  {
538  $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
539  $is_grlist = $this->is_grlist_set($this->hash['phash']);
540  if ($check > 0 || !$is_grlist || $this->forceIndexing) {
541  // Setting message:
542  if ($this->forceIndexing) {
543  $this->log_setTSlogMessage('Indexing needed, reason: Forced', 1);
544  } elseif ($check > 0) {
545  $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
546  } else {
547  $this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!', 1);
548  }
549  // Divide into title,keywords,description and body:
550  $this->log_push('Split content', '');
551  $this->contentParts = $this->splitHTMLContent($this->conf['content']);
552  if ($this->conf['indexedDocTitle']) {
553  $this->contentParts['title'] = $this->conf['indexedDocTitle'];
554  }
555  $this->log_pull();
556  // Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so dont!)
557  $this->content_md5h = IndexedSearchUtility::md5inthash(implode('', $this->contentParts));
558  // This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
559  // If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more.
560  // This will also prevent pages from being indexed if a fe_users has logged in and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem.
561  $checkCHash = $this->checkContentHash();
562  if (!is_array($checkCHash) || $check === 1) {
563  $Pstart = GeneralUtility::milliseconds();
564  $this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
565  $this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
566  $this->log_pull();
567  // Splitting words
568  $this->log_push('Extract words from content', '');
569  $splitInWords = $this->processWordsInArrays($this->contentParts);
570  $this->log_pull();
571  // Analyse the indexed words.
572  $this->log_push('Analyse the extracted words', '');
573  $indexArr = $this->indexAnalyze($splitInWords);
574  $this->log_pull();
575  // Submitting page (phash) record
576  $this->log_push('Submitting page', '');
577  $this->submitPage();
578  $this->log_pull();
579  // Check words and submit to word list if not there
580  $this->log_push('Check word list and submit words', '');
581  if (IndexedSearchUtility::isTableUsed('index_words')) {
582  $this->checkWordList($indexArr);
583  $this->submitWords($indexArr, $this->hash['phash']);
584  }
585  $this->log_pull();
586  // Set parsetime
587  $this->updateParsetime($this->hash['phash'], GeneralUtility::milliseconds() - $Pstart);
588  // Checking external files if configured for.
589  $this->log_push('Checking external files', '');
590  if ($this->conf['index_externals']) {
591  $this->extractLinks($this->conf['content']);
592  }
593  $this->log_pull();
594  } else {
595  // Update the timestamp
596  $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
597  $this->updateSetId($this->hash['phash']);
598  // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
599  $this->update_grlist($checkCHash['phash'], $this->hash['phash']);
600  $this->updateRootline();
601  $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
602  }
603  } else {
604  $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
605  }
606  }
607 
615  public function splitHTMLContent($content)
616  {
617  // divide head from body ( u-ouh :) )
618  $contentArr = $this->defaultContentArray;
619  $contentArr['body'] = stristr($content, '<body');
620  $headPart = substr($content, 0, -strlen($contentArr['body']));
621  // get title
622  $this->embracingTags($headPart, 'TITLE', $contentArr['title'], $dummy2, $dummy);
623  $titleParts = explode(':', $contentArr['title'], 2);
624  $contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);
625  // get keywords and description metatags
626  if ($this->conf['index_metatags']) {
627  $meta = [];
628  $i = 0;
629  while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $meta[$i])) {
630  $i++;
631  }
632  // @todo The code below stops at first unset tag. Is that correct?
633  for ($i = 0; isset($meta[$i]); $i++) {
634  $meta[$i] = GeneralUtility::get_tag_attributes($meta[$i]);
635  if (stristr($meta[$i]['name'], 'keywords')) {
636  $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($meta[$i]['content']);
637  }
638  if (stristr($meta[$i]['name'], 'description')) {
639  $contentArr['description'] .= ',' . $meta[$i]['content'];
640  }
641  }
642  }
643  // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
644  $this->typoSearchTags($contentArr['body']);
645  // Get rid of unwanted sections (ie. scripting and style stuff) in body
646  $tagList = explode(',', $this->excludeSections);
647  foreach ($tagList as $tag) {
648  while ($this->embracingTags($contentArr['body'], $tag, $dummy, $contentArr['body'], $dummy2)) {
649  }
650  }
651  // remove tags, but first make sure we don't concatenate words by doing it
652  $contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
653  $contentArr['body'] = trim(strip_tags($contentArr['body']));
654  $contentArr['keywords'] = trim($contentArr['keywords']);
655  $contentArr['description'] = trim($contentArr['description']);
656  // Return array
657  return $contentArr;
658  }
659 
666  public function getHTMLcharset($content)
667  {
668  if (preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i', $content, $reg)) {
669  if (preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i', $reg[0], $reg2)) {
670  return $reg2[1];
671  }
672  }
673  }
674 
682  public function convertHTMLToUtf8($content, $charset = '')
683  {
684  // Find charset:
685  $charset = $charset ?: $this->getHTMLcharset($content);
686  $charset = $this->csObj->parse_charset($charset);
687  // Convert charset:
688  if ($charset && $charset !== 'utf-8') {
689  $content = $this->csObj->conv($content, $charset, 'utf-8');
690  }
691  // Convert entities, assuming document is now UTF-8:
692  return $this->csObj->entities_to_utf8($content);
693  }
694 
707  public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
708  {
709  $endTag = '</' . $tagName . '>';
710  $startTag = '<' . $tagName;
711  // stristr used because we want a case-insensitive search for the tag.
712  $isTagInText = stristr($string, $startTag);
713  // if the tag was not found, return FALSE
714  if (!$isTagInText) {
715  return false;
716  }
717  list($paramList, $isTagInText) = explode('>', substr($isTagInText, strlen($startTag)), 2);
718  $afterTagInText = stristr($isTagInText, $endTag);
719  if ($afterTagInText) {
720  $stringBefore = substr($string, 0, strpos(strtolower($string), strtolower($startTag)));
721  $tagContent = substr($isTagInText, 0, strlen($isTagInText) - strlen($afterTagInText));
722  $stringAfter = $stringBefore . substr($afterTagInText, strlen($endTag));
723  } else {
724  $tagContent = '';
725  $stringAfter = $isTagInText;
726  }
727  return true;
728  }
729 
736  public function typoSearchTags(&$body)
737  {
738  $expBody = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body);
739  if (count($expBody) > 1) {
740  $body = '';
741  foreach ($expBody as $val) {
742  $part = explode('-->', $val, 2);
743  if (trim($part[0]) == 'begin') {
744  $body .= $part[1];
745  $prev = '';
746  } elseif (trim($part[0]) == 'end') {
747  $body .= $prev;
748  } else {
749  $prev = $val;
750  }
751  }
752  return true;
753  } else {
754  return false;
755  }
756  }
757 
764  public function extractLinks($content)
765  {
766  // Get links:
767  $list = $this->extractHyperLinks($content);
768  if ($this->indexerConfig['useCrawlerForExternalFiles'] && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
769  $crawler = GeneralUtility::makeInstance(\tx_crawler_lib::class);
770  }
771  // Traverse links:
772  foreach ($list as $linkInfo) {
773  // Decode entities:
774  if ($linkInfo['localPath']) {
775  // localPath means: This file is sent by a download script. While the indexed URL has to point to $linkInfo['href'], the absolute path to the file is specified here!
776  $linkSource = htmlspecialchars_decode($linkInfo['localPath']);
777  } else {
778  $linkSource = htmlspecialchars_decode($linkInfo['href']);
779  }
780  // Parse URL:
781  $qParts = parse_url($linkSource);
782  // Check for jumpurl (TYPO3 specific thing...)
783  if ($qParts['query'] && strstr($qParts['query'], 'jumpurl=')) {
784  parse_str($qParts['query'], $getP);
785  $linkSource = $getP['jumpurl'];
786  $qParts = parse_url($linkSource);
787  }
788  if (!$linkInfo['localPath'] && $qParts['scheme']) {
789  if ($this->indexerConfig['indexExternalURLs']) {
790  // Index external URL (http or otherwise)
791  $this->indexExternalUrl($linkSource);
792  }
793  } elseif (!$qParts['query']) {
794  $linkSource = urldecode($linkSource);
795  if (GeneralUtility::isAllowedAbsPath($linkSource)) {
796  $localFile = $linkSource;
797  } else {
798  $localFile = GeneralUtility::getFileAbsFileName(PATH_site . $linkSource);
799  }
800  if ($localFile && @is_file($localFile)) {
801  // Index local file:
802  if ($linkInfo['localPath']) {
803  $fI = pathinfo($linkSource);
804  $ext = strtolower($fI['extension']);
805  if (is_object($crawler)) {
806  $params = [
807  'document' => $linkSource,
808  'alturl' => $linkInfo['href'],
809  'conf' => $this->conf
810  ];
811  unset($params['conf']['content']);
812  $crawler->addQueueEntry_callBack(0, $params, Hook\CrawlerFilesHook::class, $this->conf['id']);
813  $this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
814  } else {
815  $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
816  }
817  } else {
818  if (is_object($crawler)) {
819  $params = [
820  'document' => $linkSource,
821  'conf' => $this->conf
822  ];
823  unset($params['conf']['content']);
824  $crawler->addQueueEntry_callBack(0, $params, Hook\CrawlerFilesHook::class, $this->conf['id']);
825  $this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
826  } else {
827  $this->indexRegularDocument($linkSource);
828  }
829  }
830  }
831  }
832  }
833  }
834 
842  public function extractHyperLinks($html)
843  {
844  $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
845  $htmlParts = $htmlParser->splitTags('a', $html);
846  $hyperLinksData = [];
847  foreach ($htmlParts as $index => $tagData) {
848  if ($index % 2 !== 0) {
849  $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
850  $firstTagName = $htmlParser->getFirstTagName($tagData);
851  if (strtolower($firstTagName) === 'a') {
852  if ($tagAttributes[0]['href'] && $tagAttributes[0]['href'][0] != '#') {
853  $hyperLinksData[] = [
854  'tag' => $tagData,
855  'href' => $tagAttributes[0]['href'],
856  'localPath' => $this->createLocalPath($tagAttributes[0]['href'])
857  ];
858  }
859  }
860  }
861  }
862  return $hyperLinksData;
863  }
864 
871  public function extractBaseHref($html)
872  {
873  $href = '';
874  $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
875  $htmlParts = $htmlParser->splitTags('base', $html);
876  foreach ($htmlParts as $index => $tagData) {
877  if ($index % 2 !== 0) {
878  $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
879  $firstTagName = $htmlParser->getFirstTagName($tagData);
880  if (strtolower($firstTagName) === 'base') {
881  $href = $tagAttributes[0]['href'];
882  if ($href) {
883  break;
884  }
885  }
886  }
887  }
888  return $href;
889  }
890 
891  /******************************************
892  *
893  * Indexing; external URL
894  *
895  ******************************************/
903  public function indexExternalUrl($externalUrl)
904  {
905  // Get headers:
906  $urlHeaders = $this->getUrlHeaders($externalUrl);
907  if (stristr($urlHeaders['Content-Type'], 'text/html')) {
908  $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
909  if ((string)$content !== '') {
910  // Create temporary file:
911  $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
912  if ($tmpFile) {
913  GeneralUtility::writeFile($tmpFile, $content);
914  // Index that file:
915  $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
916  // Using "TRUE" for second parameter to force indexing of external URLs (mtime doesn't make sense, does it?)
917  unlink($tmpFile);
918  }
919  }
920  }
921  }
922 
929  public function getUrlHeaders($url)
930  {
931  // Try to get the headers only
932  $content = GeneralUtility::getUrl($url, 2);
933  if ((string)$content !== '') {
934  // Compile headers:
935  $headers = GeneralUtility::trimExplode(LF, $content, true);
936  $retVal = [];
937  foreach ($headers as $line) {
938  if (trim($line) === '') {
939  break;
940  }
941  list($headKey, $headValue) = explode(':', $line, 2);
942  $retVal[$headKey] = $headValue;
943  }
944  return $retVal;
945  }
946  }
947 
954  protected function createLocalPath($sourcePath)
955  {
956  $localPath = '';
957  static $pathFunctions = [
958  'createLocalPathFromT3vars',
959  'createLocalPathUsingAbsRefPrefix',
960  'createLocalPathUsingDomainURL',
961  'createLocalPathFromAbsoluteURL',
962  'createLocalPathFromRelativeURL'
963  ];
964  foreach ($pathFunctions as $functionName) {
965  $localPath = $this->{$functionName}($sourcePath);
966  if ($localPath != '') {
967  break;
968  }
969  }
970  return $localPath;
971  }
972 
981  protected function createLocalPathFromT3vars($sourcePath)
982  {
983  $localPath = '';
984  $indexLocalFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
985  if (is_array($indexLocalFiles)) {
986  $md5 = GeneralUtility::shortMD5($sourcePath);
987  // Note: not using self::isAllowedLocalFile here because this method
988  // is allowed to index files outside of the web site (for example,
989  // protected downloads)
990  if (isset($indexLocalFiles[$md5]) && is_file($indexLocalFiles[$md5])) {
991  $localPath = $indexLocalFiles[$md5];
992  }
993  }
994  return $localPath;
995  }
996 
1003  protected function createLocalPathUsingDomainURL($sourcePath)
1004  {
1005  $localPath = '';
1006  $baseURL = GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
1007  $baseURLLength = strlen($baseURL);
1008  if (substr($sourcePath, 0, $baseURLLength) == $baseURL) {
1009  $sourcePath = substr($sourcePath, $baseURLLength);
1010  $localPath = PATH_site . $sourcePath;
1011  if (!self::isAllowedLocalFile($localPath)) {
1012  $localPath = '';
1013  }
1014  }
1015  return $localPath;
1016  }
1017 
1025  protected function createLocalPathUsingAbsRefPrefix($sourcePath)
1026  {
1027  $localPath = '';
1028  if ($GLOBALS['TSFE'] instanceof \TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController) {
1029  $absRefPrefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
1030  $absRefPrefixLength = strlen($absRefPrefix);
1031  if ($absRefPrefixLength > 0 && substr($sourcePath, 0, $absRefPrefixLength) == $absRefPrefix) {
1032  $sourcePath = substr($sourcePath, $absRefPrefixLength);
1033  $localPath = PATH_site . $sourcePath;
1034  if (!self::isAllowedLocalFile($localPath)) {
1035  $localPath = '';
1036  }
1037  }
1038  }
1039  return $localPath;
1040  }
1041 
1049  protected function createLocalPathFromAbsoluteURL($sourcePath)
1050  {
1051  $localPath = '';
1052  if ($sourcePath[0] == '/') {
1053  $sourcePath = substr($sourcePath, 1);
1054  $localPath = PATH_site . $sourcePath;
1055  if (!self::isAllowedLocalFile($localPath)) {
1056  $localPath = '';
1057  }
1058  }
1059  return $localPath;
1060  }
1061 
1068  protected function createLocalPathFromRelativeURL($sourcePath)
1069  {
1070  $localPath = '';
1071  if (self::isRelativeURL($sourcePath)) {
1072  $localPath = PATH_site . $sourcePath;
1073  if (!self::isAllowedLocalFile($localPath)) {
1074  $localPath = '';
1075  }
1076  }
1077  return $localPath;
1078  }
1079 
1086  protected static function isRelativeURL($url)
1087  {
1088  $urlParts = @parse_url($url);
1089  return $urlParts['scheme'] == '' && $urlParts['path'][0] != '/';
1090  }
1091 
1098  protected static function isAllowedLocalFile($filePath)
1099  {
1100  $filePath = GeneralUtility::resolveBackPath($filePath);
1101  $insideWebPath = substr($filePath, 0, strlen(PATH_site)) == PATH_site;
1102  $isFile = is_file($filePath);
1103  return $insideWebPath && $isFile;
1104  }
1105 
1106  /******************************************
1107  *
1108  * Indexing; external files (PDF, DOC, etc)
1109  *
1110  ******************************************/
/**
 * Indexes a regular document (an external file such as a PDF or DOC).
 *
 * The file is handed to the external parser registered for its extension. Each
 * content part reported by the parser is checked against the stored mtime and
 * content hash and only (re-)indexed when needed or when $force is set.
 *
 * @param string $file File name, either absolute or relative to PATH_site
 * @param bool $force If set, indexing is done regardless of the mtime/content hash checks and of the external file limit
 * @param string $contentTmpFile If set, content is read from this file instead of $file
 * @param string $altExtension If set, used instead of the extension of $file to select the parser
 * @return void
 */
public function indexRegularDocument($file, $force = false, $contentTmpFile = '', $altExtension = '')
{
    // Init
    $fI = pathinfo($file);
    // pathinfo() has no "extension" element for files without an extension
    $ext = $altExtension ?: strtolower(isset($fI['extension']) ? $fI['extension'] : '');
    // Create abs-path:
    if (!$contentTmpFile) {
        if (!GeneralUtility::isAbsPath($file)) {
            // Relative, prepend PATH_site:
            $absFile = GeneralUtility::getFileAbsFileName(PATH_site . $file);
        } else {
            // Absolute, pass-through:
            $absFile = $file;
        }
        $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
    } else {
        $absFile = $contentTmpFile;
    }
    if (!$absFile || !@is_file($absFile)) {
        $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
        return;
    }
    if (empty($this->external_parsers[$ext])) {
        $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
        return;
    }
    // Indexing the document:
    $fileInfo = stat($absFile);
    $cParts = $this->fileContentParts($ext, $absFile);
    foreach ($cParts as $cPKey) {
        $this->internal_log = [];
        $this->log_push('Index: ' . str_replace('.', '_', basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
        $Pstart = GeneralUtility::milliseconds();
        $subinfo = ['key' => $cPKey];
        // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
        $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
        $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
        if ($check > 0 || $force) {
            if ($check > 0) {
                $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
            } else {
                $this->log_setTSlogMessage('Indexing forced by flag', 1);
            }
            // Check external file counter:
            if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                // Divide into title,keywords,description and body:
                $this->log_push('Split content', '');
                $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                $this->log_pull();
                if (is_array($contentParts)) {
                    // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                    // NOTE(review): this assignment was absent from the reviewed listing although the
                    // variable is used below; restored as a hash over all content parts - confirm
                    // against indexTypo3PageContent().
                    $content_md5h = IndexedSearchUtility::md5inthash(implode('', $contentParts));
                    if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                        // Increment counter:
                        $this->externalFileCounter++;
                        // Splitting words
                        $this->log_push('Extract words from content', '');
                        $splitInWords = $this->processWordsInArrays($contentParts);
                        $this->log_pull();
                        // Analyse the indexed words.
                        $this->log_push('Analyse the extracted words', '');
                        $indexArr = $this->indexAnalyze($splitInWords);
                        $this->log_pull();
                        // Submitting page (phash) record
                        $this->log_push('Submitting page', '');
                        // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
                        $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
                        $this->log_pull();
                        // Check words and submit to word list if not there
                        $this->log_push('Check word list and submit words', '');
                        if (IndexedSearchUtility::isTableUsed('index_words')) {
                            $this->checkWordList($indexArr);
                            $this->submitWords($indexArr, $phash_arr['phash']);
                        }
                        $this->log_pull();
                        // Set parsetime
                        $this->updateParsetime($phash_arr['phash'], GeneralUtility::milliseconds() - $Pstart);
                    } else {
                        // Update the timestamp
                        $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
                        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                    }
                } else {
                    $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                }
            } else {
                $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
            }
        } else {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        }
        // Checking and setting sections:
        // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
        $this->submitFile_section($phash_arr['phash']);
        $this->log_pull();
    }
}
1217 
/**
 * Reads the content of an external file by delegating to the parser registered for its extension.
 *
 * @param string $fileExtension File extension, used as key into $this->external_parsers
 * @param string $absoluteFileName Absolute file name, passed on to the parser
 * @param string $sectionPointer Section pointer, passed on to the parser
 * @return mixed Whatever the parser's readFileContent() returns, or NULL if no parser object is registered for the extension
 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
{
    $parser = $this->external_parsers[$fileExtension];
    if (!is_object($parser)) {
        return null;
    }
    return $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
1236 
/**
 * Asks the parser registered for an extension into which content parts a file is divided.
 *
 * @param string $ext File extension, used as key into $this->external_parsers
 * @param string $absFile Absolute file name, passed on to the parser
 * @return array Result of the parser's fileContentParts(), or [0] if no parser object is registered for the extension
 */
public function fileContentParts($ext, $absFile)
{
    $parser = $this->external_parsers[$ext];
    if (is_object($parser)) {
        return $parser->fileContentParts($ext, $absFile);
    }
    // Default when no parser object is available: a single part with key 0
    return [0];
}
1253 
/**
 * Wraps plain content in the default content array, placing it in the "body" element.
 *
 * @param string $content Content to use as body
 * @return array Copy of $this->defaultContentArray with "body" set to $content
 */
public function splitRegularContent($content)
{
    $parts = $this->defaultContentArray;
    $parts['body'] = $content;
    return $parts;
}
1267 
1268  /**********************************
1269  *
1270  * Analysing content, Extracting words
1271  *
1272  **********************************/
/**
 * Converts every non-empty element of the content array to utf-8 and decodes entities in it.
 *
 * @param array $contentArr Content array, modified in place
 * @param string $charset Charset of the input content; conversion is skipped when it is exactly "utf-8"
 * @return void
 */
public function charsetEntity2utf8(&$contentArr, $charset)
{
    $needsConversion = $charset !== 'utf-8';
    foreach (array_keys($contentArr) as $part) {
        if ((string)$contentArr[$part] === '') {
            continue;
        }
        // Convert charset if necessary
        if ($needsConversion) {
            $contentArr[$part] = $this->csObj->conv($contentArr[$part], $charset, 'utf-8');
        }
        // decode all numeric / html-entities in the string to real characters:
        $contentArr[$part] = $this->csObj->entities_to_utf8($contentArr[$part]);
    }
}
1293 
/**
 * Splits every element of the content array into words using the lexer object.
 *
 * @param array $contentArr Content array with (at least) the keys "title", "keywords" and "description"
 * @return array Content array where each element is the lexer's split2Words() result; title, keywords and description are de-duplicated
 */
public function processWordsInArrays($contentArr)
{
    // Split all parts to words
    foreach (array_keys($contentArr) as $part) {
        $contentArr[$part] = $this->lexerObj->split2Words($contentArr[$part]);
    }
    // For title, keywords, and description we don't want duplicates:
    foreach (['title', 'keywords', 'description'] as $headerPart) {
        $contentArr[$headerPart] = array_unique($contentArr[$headerPart]);
    }
    return $contentArr;
}
1313 
/**
 * Builds a short description from the body text.
 *
 * Whitespace runs are collapsed to single spaces and the result is cut to the
 * configured byte length ($this->conf['index_descrLgd'], forced into the range
 * 0-255 with a default of 200) without splitting a multi-byte character.
 *
 * @param array $contentArr Content array; only the "body" element is read
 * @return string The shortened description, or an empty string when the configured length is 0
 */
public function bodyDescription($contentArr)
{
    // Initialised up front so that a maximum length of 0 yields an empty string
    // instead of an undefined variable.
    $bodyDescription = '';
    // Setting description
    $maxL = MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
    if ($maxL) {
        $bodyDescription = preg_replace('/\s+/u', ' ', $contentArr['body']);
        // Shorten the string:
        $bodyDescription = mb_strcut($bodyDescription, 0, $maxL, 'utf-8');
    }
    return $bodyDescription;
}
1331 
/**
 * Builds the word index array from content that has already been split into words.
 *
 * @param array $content Content array with word lists for "title", "keywords", "description" and "body"
 * @return array Index array as filled by analyzeHeaderinfo() and analyzeBody()
 */
public function indexAnalyze($content)
{
    $wordIndex = [];
    // Header part => bit offset handed to analyzeHeaderinfo()
    $headerOffsets = [
        'title' => 7,
        'keywords' => 6,
        'description' => 5
    ];
    foreach ($headerOffsets as $headerPart => $bitOffset) {
        $this->analyzeHeaderinfo($wordIndex, $content, $headerPart, $bitOffset);
    }
    $this->analyzeBody($wordIndex, $content);
    return $wordIndex;
}
1347 
/**
 * Registers the words of one header part (e.g. title, keywords, description) in the index array.
 *
 * For each word the entry gets "hash" and "metaphone" on first sight, the bit
 * 2^$offset is OR-ed into "cmp" and "count" is increased. $this->wordcount is
 * increased per word as well.
 *
 * @param array $retArr Index array, modified in place; keyed by the word cut to 60 bytes
 * @param array $content Content array of word lists
 * @param string $key Key in $content whose words are analysed
 * @param int $offset Bit position set in the "cmp" flags for every word of this part
 * @return void
 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset)
{
    foreach ($content[$key] as $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also 60 only chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Priority used for flagBitMask feature (see extension configuration).
        // "cmp" does not exist yet on the first occurrence of a word, so start from 0.
        $currentFlags = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
        $retArr[$val]['cmp'] = $currentFlags | pow(2, $offset);
        // Increase number of occurrences ("count" is likewise absent on the first occurrence)
        $retArr[$val]['count'] = (isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0) + 1;
        $this->wordcount++;
    }
}
1380 
/**
 * Registers the words of the body in the index array.
 *
 * A word not yet present gets "first" (its key in the body word list), "hash"
 * and "metaphone"; "count" is increased for every occurrence, as is
 * $this->wordcount.
 *
 * @param array $retArr Index array, modified in place; keyed by the word cut to 60 bytes
 * @param array $content Content array; only the "body" word list is read
 * @return void
 */
public function analyzeBody(&$retArr, $content)
{
    foreach ($content['body'] as $key => $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // First occurrence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Increase number of occurrences; "count" is absent on the first occurrence of a word
        $retArr[$val]['count'] = (isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0) + 1;
        $this->wordcount++;
    }
}
1411 
/**
 * Computes the metaphone value of a word.
 *
 * Uses $this->metaphoneObj when it is an object, otherwise PHP's native metaphone().
 *
 * @param string $word Word to process
 * @param bool $returnRawMetaphoneValue If set, the raw metaphone value is returned instead of its integer hash
 * @return mixed Raw metaphone value, or the md5inthash() of it, or 0 if the raw value is an empty string
 */
public function metaphone($word, $returnRawMetaphoneValue = false)
{
    // Use native PHP function unless a dedicated metaphone object is available
    $rawValue = is_object($this->metaphoneObj)
        ? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
        : metaphone($word);
    if ($returnRawMetaphoneValue) {
        return $rawValue;
    }
    if ($rawValue === '') {
        return 0;
    }
    // Create hash and return integer
    return IndexedSearchUtility::md5inthash($rawValue);
}
1437 
1438  /********************************
1439  *
1440  * SQL; TYPO3 Pages
1441  *
1442  *******************************/
/**
 * Writes the index records for the current TYPO3 page.
 *
 * Removes existing records for the page hash, then inserts into index_phash,
 * index_section, index_grlist, index_fulltext and - in debug mode -
 * index_debug. Each insert is skipped when the table is not in use.
 *
 * @return void
 */
public function submitPage()
{
    $pageHash = $this->hash['phash'];
    // Remove any current data for this phash:
    $this->removeOldIndexedPages($pageHash);
    // Setting new phash_row
    $phashRow = [
        'phash' => $pageHash,
        'phash_grouping' => $this->hash['phash_grouping'],
        'cHashParams' => serialize($this->cHashParams),
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        'data_page_reg1' => $this->conf['page_cache_reg1'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // TYPO3 page
        'item_type' => 0,
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => (int)$this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        // Creation date of page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $pool = GeneralUtility::makeInstance(ConnectionPool::class);
        $pool->getConnectionForTable('index_phash')->insert('index_phash', $phashRow);
    }
    // PROCESSING index_section
    $this->submit_section($pageHash, $pageHash);
    // PROCESSING index_grlist
    $this->submit_grlist($pageHash, $pageHash);
    // PROCESSING index_fulltext
    $fullText = implode(' ', $this->contentParts);
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        $fullText = substr($fullText, 0, $this->indexerConfig['fullTextDataLength']);
    }
    $fulltextRow = [
        'phash' => $pageHash,
        'fulltextdata' => $fullText,
        'metaphonedata' => $this->metaphoneContent
    ];
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $pool = GeneralUtility::makeInstance(ConnectionPool::class);
        $pool->getConnectionForTable('index_fulltext')->insert('index_fulltext', $fulltextRow);
    }
    // PROCESSING index_debug
    if (!$this->indexerConfig['debugMode']) {
        return;
    }
    // Content and body are truncated to 1000 bytes in the debug record
    $debugInfo = [
        'cHashParams' => $this->cHashParams,
        'external_parsers initialized' => array_keys($this->external_parsers),
        'conf' => array_merge($this->conf, ['content' => substr($this->conf['content'], 0, 1000)]),
        'contentParts' => array_merge($this->contentParts, ['body' => substr($this->contentParts['body'], 0, 1000)]),
        'logs' => $this->internal_log,
        'lexer' => $this->lexerObj->debugString
    ];
    $debugRow = [
        'phash' => $pageHash,
        'debuginfo' => serialize($debugInfo)
    ];
    if (IndexedSearchUtility::isTableUsed('index_debug')) {
        $pool = GeneralUtility::makeInstance(ConnectionPool::class);
        $pool->getConnectionForTable('index_debug')->insert('index_debug', $debugRow);
    }
}
1523 
/**
 * Inserts a gr_list record into index_grlist for the group list in $this->conf['gr_list'].
 *
 * @param int $hash Value stored in the "phash" column
 * @param int $phash_x Value stored in the "phash_x" column
 * @return void
 */
public function submit_grlist($hash, $phash_x)
{
    $groupList = $this->conf['gr_list'];
    // Setting the gr_list record
    $record = [
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($groupList),
        'gr_list' => $groupList
    ];
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    $pool = GeneralUtility::makeInstance(ConnectionPool::class);
    $pool->getConnectionForTable('index_grlist')->insert('index_grlist', $record);
}
1547 
/**
 * Inserts a record into index_section, including the rootline fields of the current page.
 *
 * @param int $hash Value stored in the "phash" column
 * @param int $hash_t3 Value stored in the "phash_t3" column
 * @return void
 */
public function submit_section($hash, $hash_t3)
{
    $record = [
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id']
    ];
    // Adds rl0, rl1, rl2 and any additionally configured rootline fields
    $this->getRootLineFields($record);
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    $pool = GeneralUtility::makeInstance(ConnectionPool::class);
    $pool->getConnectionForTable('index_section')->insert('index_section', $record);
}
1570 
/**
 * Deletes all index records of a TYPO3 page identified by its phash.
 *
 * @param int $phash Page hash whose records are removed
 * @return void
 */
public function removeOldIndexedPages($phash)
{
    $pool = GeneralUtility::makeInstance(ConnectionPool::class);
    $pageHash = (int)$phash;
    // Removing old registrations for all tables. Because the pages are TYPO3 pages
    // there can be nothing else than 1-1 relations here.
    foreach (['index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        if (!IndexedSearchUtility::isTableUsed($tableName)) {
            continue;
        }
        $pool->getConnectionForTable($tableName)->delete($tableName, ['phash' => $pageHash]);
    }

    // Removing all index_section records with hash_t3 set to this hash (this includes such
    // records set for external media on the page as well!). The re-insert of these records
    // are done in indexRegularDocument($file).
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        $sectionConnection = $pool->getConnectionForTable('index_section');
        $sectionConnection->delete('index_section', ['phash_t3' => $pageHash]);
    }
}
1597 
1598  /********************************
1599  *
1600  * SQL; External media
1601  *
1602  *******************************/
/**
 * Writes the index records for an external file.
 *
 * Removes existing records for the file's phash, then inserts into
 * index_phash, index_fulltext and - in debug mode - index_debug. Each insert
 * is skipped when the table is not in use.
 *
 * @param array $hash Hash array with the keys "phash" and "phash_grouping"
 * @param string $file File name, stored as data_filename
 * @param array $subinfo Sub-information of the file part, stored serialized as cHashParams
 * @param string $ext File extension, used to look up the parser and item type
 * @param int $mtime Modification time, stored as item_mtime
 * @param int $ctime Stored as item_crdate
 * @param int $size File size, stored as item_size
 * @param int $content_md5h Content hash, stored as contentHash
 * @param array $contentParts Content array of the file (title, body, ...)
 * @return void
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
{
    $fileHash = $hash['phash'];
    // Find item Type: the parser's mapping wins, the extension itself is the fallback
    $storeItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext] ?: $ext;
    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($fileHash);
    // Split filename:
    $fileParts = parse_url($file);
    // Setting new
    $phashRow = [
        'phash' => $fileHash,
        'phash_grouping' => $hash['phash_grouping'],
        'cHashParams' => serialize($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $storeItemType,
        'item_title' => trim($contentParts['title']) ?: basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        // A file name carrying a URL scheme is flagged as external URL
        'externalUrl' => $fileParts['scheme'] ? 1 : 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $pool = GeneralUtility::makeInstance(ConnectionPool::class);
        $pool->getConnectionForTable('index_phash')->insert('index_phash', $phashRow);
    }
    // PROCESSING index_fulltext
    $fullText = implode(' ', $contentParts);
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        $fullText = substr($fullText, 0, $this->indexerConfig['fullTextDataLength']);
    }
    $fulltextRow = [
        'phash' => $fileHash,
        'fulltextdata' => $fullText,
        'metaphonedata' => $this->metaphoneContent
    ];
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $pool = GeneralUtility::makeInstance(ConnectionPool::class);
        $pool->getConnectionForTable('index_fulltext')->insert('index_fulltext', $fulltextRow);
    }
    // PROCESSING index_debug
    if (!$this->indexerConfig['debugMode']) {
        return;
    }
    // The body is truncated to 1000 bytes in the debug record
    $debugInfo = [
        'cHashParams' => $subinfo,
        'contentParts' => array_merge($contentParts, ['body' => substr($contentParts['body'], 0, 1000)]),
        'logs' => $this->internal_log,
        'lexer' => $this->lexerObj->debugString
    ];
    $debugRow = [
        'phash' => $fileHash,
        'debuginfo' => serialize($debugInfo)
    ];
    if (IndexedSearchUtility::isTableUsed('index_debug')) {
        $pool = GeneralUtility::makeInstance(ConnectionPool::class);
        $pool->getConnectionForTable('index_debug')->insert('index_debug', $debugRow);
    }
}
1685 
/**
 * Ensures a gr_list record exists for an external file.
 *
 * A new record is submitted only when index_grlist holds no row for the phash
 * with either the default group list or the current group list.
 *
 * @param int $hash phash of the file
 * @return void
 */
public function submitFile_grlist($hash)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    // Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_grlist');
    $expr = $queryBuilder->expr();

    $matchesHash = $expr->eq(
        'phash',
        $queryBuilder->createNamedParameter($hash, \PDO::PARAM_INT)
    );
    $matchesDefaultGroups = $expr->eq(
        'hash_gr_list',
        $queryBuilder->createNamedParameter(
            IndexedSearchUtility::md5inthash($this->defaultGrList),
            \PDO::PARAM_INT
        )
    );
    $matchesCurrentGroups = $expr->eq(
        'hash_gr_list',
        $queryBuilder->createNamedParameter(
            IndexedSearchUtility::md5inthash($this->conf['gr_list']),
            \PDO::PARAM_INT
        )
    );

    $statement = $queryBuilder->count('*')
        ->from('index_grlist')
        ->where($matchesHash, $expr->orX($matchesDefaultGroups, $matchesCurrentGroups))
        ->execute();
    $existing = (int)$statement->fetchColumn();

    if ($existing === 0) {
        $this->submit_grlist($hash, $hash);
    }
}
1732 
/**
 * Ensures a section record exists for an external file on the current page.
 *
 * A new record is submitted only when index_section holds no row for the
 * given phash and the current page id.
 *
 * @param int $hash phash of the file
 * @return void
 */
public function submitFile_section($hash)
{
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }

    // Testing if there is already a section
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_section');
    $expr = $queryBuilder->expr();

    $matchesHash = $expr->eq(
        'phash',
        $queryBuilder->createNamedParameter($hash, \PDO::PARAM_INT)
    );
    $matchesPage = $expr->eq(
        'page_id',
        $queryBuilder->createNamedParameter($this->conf['id'], \PDO::PARAM_INT)
    );

    $statement = $queryBuilder->count('phash')
        ->from('index_section')
        ->where($matchesHash, $matchesPage)
        ->execute();
    $existing = (int)$statement->fetchColumn();

    if ($existing === 0) {
        // The file is attached to the phash of the current page
        $this->submit_section($hash, $this->hash['phash']);
    }
}
1767 
/**
 * Deletes the index records of an external file identified by its phash.
 *
 * Unlike removeOldIndexedPages() this leaves index_section untouched.
 *
 * @param int $phash File hash whose records are removed
 * @return void
 */
public function removeOldIndexedFiles($phash)
{
    $pool = GeneralUtility::makeInstance(ConnectionPool::class);
    $fileHash = (int)$phash;
    // Removing old registrations for tables.
    foreach (['index_phash', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        if (IndexedSearchUtility::isTableUsed($tableName)) {
            $pool->getConnectionForTable($tableName)->delete($tableName, ['phash' => $fileHash]);
        }
    }
}
1786 
1787  /********************************
1788  *
1789  * SQL Helper functions
1790  *
1791  *******************************/
/**
 * Decides from the stored index_phash record whether a document needs (re-)indexing.
 *
 * The returned code is a key of $this->reasons; positive values mean
 * "index", negative values mean "do not index". When the mtime matches and no
 * max age is configured, the record's timestamp is refreshed as a side effect.
 *
 * @param int $mtime Modification time of the document; a false-like value means "not set"
 * @param int $phash phash of the document
 * @return int Reason code: 1, 2, 3, 4, -1 or -2
 */
public function checkMtimeTstamp($mtime, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        // Not indexed (not in index_phash)
        return 4;
    }

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_phash');
    $row = $connection
        ->select(['item_mtime', 'tstamp'], 'index_phash', ['phash' => (int)$phash], [], [], 1)
        ->fetch();

    if (empty($row)) {
        // Page has never been indexed (is not represented in the index_phash table).
        return 4;
    }

    $now = $GLOBALS['EXEC_TIME'];
    if ($this->tstamp_maxAge && $row['tstamp'] + $this->tstamp_maxAge < $now) {
        // The configured max-age was exceeded for the document and thus it's indexed.
        return 1;
    }
    if ($this->tstamp_minAge && $row['tstamp'] + $this->tstamp_minAge >= $now) {
        // The minimum age was not exceeded
        return -2;
    }
    // From here on minAge is either not set or exceeded, so mtime decides.
    if (!$mtime) {
        // The minimum age was exceed, but mtime was not set, so the page was indexed.
        return 3;
    }
    if ($row['item_mtime'] != $mtime) {
        // The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.
        return 2;
    }

    // mtime matched the document, so no changes detected and no content updated
    if ($this->tstamp_maxAge) {
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($row['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . ' seconds to expire time).', 1);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
    }
    return -1;
}
1857 
/**
 * Looks for an index_phash record with the current phash_grouping and content hash.
 *
 * @return mixed The found row (array with "phash") if such a record exists, otherwise TRUE - also when index_phash is not in use
 */
public function checkContentHash()
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    // With this query the page will only be indexed if it's content is different from the same "phash_grouping" -page.
    $criteria = [
        'phash_grouping' => (int)$this->hash['phash_grouping'],
        'contentHash' => (int)$this->content_md5h
    ];
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_phash');
    $row = $connection
        ->select(['phash'], 'index_phash', $criteria, [], [], 1)
        ->fetch();

    return empty($row) ? true : $row;
}
1888 
/**
 * Checks whether index_phash has no record yet for the given grouping hash and content hash.
 *
 * @param int $hashGr phash_grouping of the external document
 * @param int $content_md5h Content hash of the external document
 * @return bool TRUE if no matching record exists or index_phash is not in use, otherwise FALSE
 */
public function checkExternalDocContentHash($hashGr, $content_md5h)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $criteria = [
        'phash_grouping' => (int)$hashGr,
        'contentHash' => (int)$content_md5h
    ];
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash');
    $matches = (int)$connection->count('*', 'index_phash', $criteria);

    return $matches === 0;
}
1916 
/**
 * Checks whether index_grlist contains at least one record for the given phash_x.
 *
 * @param int $phash_x phash_x value to look up
 * @return bool TRUE if a record exists; FALSE if none exists or index_grlist is not in use
 */
public function is_grlist_set($phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return false;
    }

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist');
    $matches = (int)$connection->count('phash_x', 'index_grlist', ['phash_x' => (int)$phash_x]);

    return $matches > 0;
}
1939 
/**
 * Inserts a gr_list record for the current group list unless index_grlist already has one for the phash.
 *
 * @param int $phash phash to check and to store in the new record
 * @param int $phash_x phash_x value for the new record
 * @return void
 */
public function update_grlist($phash, $phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $criteria = [
        'phash' => (int)$phash,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($this->conf['gr_list'])
    ];
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist');
    $matches = (int)$connection->count('phash', 'index_grlist', $criteria);

    if ($matches !== 0) {
        return;
    }
    $this->submit_grlist($phash, $phash_x);
    $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
}
1968 
/**
 * Sets the tstamp of an index_phash record to the current execution time.
 *
 * @param int $phash phash of the record to update
 * @param int $mtime If set (non-zero), item_mtime is updated to this value as well
 * @return void
 */
public function updateTstamp($phash, $mtime = 0)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $changes = ['tstamp' => $GLOBALS['EXEC_TIME']];
        if ($mtime) {
            $changes['item_mtime'] = (int)$mtime;
        }

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->update('index_phash', $changes, ['phash' => (int)$phash]);
    }
}
2000 
/**
 * Writes the configured freeIndexSetId into the index_phash record of the given phash.
 *
 * @param int $phash phash of the record to update
 * @return void
 */
public function updateSetId($phash)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $changes = ['freeIndexSetId' => (int)$this->conf['freeIndexSetId']];
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->update('index_phash', $changes, ['phash' => (int)$phash]);
    }
}
2025 
/**
 * Stores the parse time in the index_phash record of the given phash.
 *
 * @param int $phash phash of the record to update
 * @param int $parsetime Parse time value, cast to integer before storing
 * @return void
 */
public function updateParsetime($phash, $parsetime)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $changes = ['parsetime' => (int)$parsetime];
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->update('index_phash', $changes, ['phash' => (int)$phash]);
    }
}
2051 
/**
 * Rewrites the rootline fields of all index_section records belonging to the current page id.
 *
 * @return void
 */
public function updateRootline()
{
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        $rootlineValues = [];
        $this->getRootLineFields($rootlineValues);

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_section');
        $connection->update('index_section', $rootlineValues, ['page_id' => (int)$this->conf['id']]);
    }
}
2076 
/**
 * Adds the rootline fields to a field array.
 *
 * Sets rl0, rl1 and rl2 from the first three rootline uids, plus one field per
 * entry (field name => rootline level) configured in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'].
 *
 * @param array $fieldArray Field array, modified in place
 * @return void
 */
public function getRootLineFields(array &$fieldArray)
{
    $rootlineUids = $this->conf['rootline_uids'];
    foreach ([0, 1, 2] as $level) {
        $fieldArray['rl' . $level] = (int)$rootlineUids[$level];
    }
    $extraFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'];
    if (!is_array($extraFields)) {
        return;
    }
    foreach ($extraFields as $fieldName => $rootLineLevel) {
        $fieldArray[$fieldName] = (int)$rootlineUids[$rootLineLevel];
    }
}
2095 
/**
 * Loads the library file "class.tx_crawler_lib.php" from the "crawler" extension.
 *
 * @return void
 */
public function includeCrawlerClass()
{
    $crawlerPath = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('crawler');
    require_once $crawlerPath . 'class.tx_crawler_lib.php';
}
2107 
2108  /********************************
2109  *
2110  * SQL; Submitting words
2111  *
2112  *******************************/
/**
 * Makes sure every word of the list exists in index_words, inserting the missing ones.
 *
 * @param array $wordListArray Word list keyed by word; each entry provides "hash" and "metaphone"
 * @return void
 */
public function checkWordList($wordListArray)
{
    if (!IndexedSearchUtility::isTableUsed('index_words') || empty($wordListArray)) {
        return;
    }

    $expectedCount = count($wordListArray);
    $wordIds = array_map('intval', array_column($wordListArray, 'hash'));

    // First a cheap count: how many of the word ids are already stored?
    $countQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('index_words');
    $storedCount = (int)$countQuery->count('baseword')
        ->from('index_words')
        ->where(
            $countQuery->expr()->in(
                'wid',
                $countQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        )
        ->execute()
        ->fetchColumn();

    if ($storedCount === $expectedCount) {
        return;
    }

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_words');
    $selectQuery = $connection->createQueryBuilder();
    $storedWords = $selectQuery->select('baseword')
        ->from('index_words')
        ->where(
            $selectQuery->expr()->in(
                'wid',
                $selectQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        )
        ->execute();

    $this->log_setTSlogMessage('Inserting words: ' . ($expectedCount - $storedCount), 1);
    // Drop the words that are stored already; what remains must be inserted
    while ($storedWord = $storedWords->fetch()) {
        unset($wordListArray[$storedWord['baseword']]);
    }

    foreach ($wordListArray as $word => $wordData) {
        // A duplicate-key error will occur here if a word is NOT unset in the unset() line. However as
        // long as the words in $wl are NOT longer as 60 chars (the baseword varchar is 60 characters...)
        // this is not a problem.
        $newRecord = [
            'wid' => $wordData['hash'],
            'baseword' => $word,
            'metaphone' => $wordData['metaphone']
        ];
        $connection->insert('index_words', $newRecord);
    }
}
2174 
/**
 * Replaces the word relations (index_rel) of a phash with those of the given word list.
 *
 * Words flagged as stop words in index_words are skipped. Existing relations
 * for the phash are always deleted; the bulk insert is only issued when at
 * least one relation remains.
 *
 * @param array $wordList Word list; each entry provides "hash" and "count", and optionally "first" and "cmp"
 * @param int $phash phash the relations belong to
 * @return void
 */
public function submitWords($wordList, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_rel')) {
        return;
    }
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    $queryBuilder = $connectionPool->getQueryBuilderForTable('index_words');
    $result = $queryBuilder->select('wid')
        ->from('index_words')
        ->where(
            $queryBuilder->expr()->neq('is_stopword', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT))
        )
        ->groupBy('wid')
        ->execute();

    $stopWords = [];
    while ($row = $result->fetch()) {
        $stopWords[$row['wid']] = $row;
    }

    $connectionPool->getConnectionForTable('index_rel')->delete('index_rel', ['phash' => (int)$phash]);

    $fields = ['phash', 'wid', 'count', 'first', 'freq', 'flags'];
    $rows = [];
    foreach ($wordList as $val) {
        if (isset($stopWords[$val['hash']])) {
            continue;
        }
        // Guard against a zero word count so the frequency never divides by zero
        $frequency = $this->wordcount > 0 ? $val['count'] / $this->wordcount : 0;
        $rows[] = [
            (int)$phash,
            (int)$val['hash'],
            (int)$val['count'],
            // "first" is only set for words that occur in the body
            isset($val['first']) ? (int)$val['first'] : 0,
            $this->freqMap($frequency),
            // "cmp" is only set for words that occur in a header part
            (isset($val['cmp']) ? $val['cmp'] : 0) & $this->flagBitMask
        ];
    }

    // Nothing to insert (empty list or only stop words): skip the bulk insert,
    // which has no values to build a statement from.
    if (empty($rows)) {
        return;
    }
    $connectionPool->getConnectionForTable('index_rel')->bulkInsert('index_rel', $rows, $fields);
}
2222 
/**
 * Maps a word frequency onto the configured frequency range.
 *
 * Values up to 1 are multiplied by freqMax * 100 * freqRange and capped at
 * freqRange; values above 1 are divided by that same factor.
 *
 * @param float $freq Frequency to map
 * @return int|float Mapped frequency
 */
public function freqMap($freq)
{
    $scale = $this->freqMax * 100 * $this->freqRange;
    if ($freq > 1) {
        return $freq / $scale;
    }
    $mapped = $freq * $scale;
    if ($mapped > $this->freqRange) {
        return $this->freqRange;
    }
    return $mapped;
}
2241 
2242  /********************************
2243  *
2244  * Hashing
2245  *
2246  *******************************/
/**
 * Computes the two hashes of the current TYPO3 page and stores them in $this->hash.
 *
 * "phash_grouping" covers id, type, language, mount point and cHash
 * parameters; "phash" additionally covers the gr_list.
 *
 * @return void
 */
public function setT3Hashes()
{
    // Grouping hash (Identifies a "page" combined of id, type, language, mountpoint and cHash parameters):
    $groupingData = [
        'id' => (int)$this->conf['id'],
        'type' => (int)$this->conf['type'],
        'sys_lang' => (int)$this->conf['sys_language_uid'],
        'MP' => (string)$this->conf['MP'],
        'cHash' => $this->cHashParams
    ];
    $this->hash['phash_grouping'] = IndexedSearchUtility::md5inthash(serialize($groupingData));

    // Plain phash: same data with gr_list appended (Subdivision where special page composition based on login
    // is taken into account as well. It is expected that such pages are normally similar regardless of the login.)
    $pageData = $groupingData;
    $pageData['gr_list'] = (string)$this->conf['gr_list'];
    $this->hash['phash'] = IndexedSearchUtility::md5inthash(serialize($pageData));
}
2268 
/**
 * Computes the two hashes of an external file.
 *
 * "phash_grouping" covers the file name only; "phash" additionally covers the sub-information.
 *
 * @param string $file File name
 * @param array $subinfo Sub-information about the file part
 * @return array Array with the keys "phash_grouping" and "phash"
 */
public function setExtHashes($file, $subinfo = [])
{
    $groupingData = ['file' => $file];
    $fileData = $groupingData;
    // Add subinfo
    $fileData['subinfo'] = $subinfo;

    return [
        'phash_grouping' => IndexedSearchUtility::md5inthash(serialize($groupingData)),
        'phash' => IndexedSearchUtility::md5inthash(serialize($fileData))
    ];
}
2290 
2291  /*********************************
2292  *
2293  * Internal logging functions
2294  *
2295  *********************************/
/**
 * Forwards a push() call to the time tracker.
 *
 * @param string $msg Message, passed on unchanged
 * @param string $key Key, passed on unchanged
 * @return void
 */
public function log_push($msg, $key)
{
    $tracker = $this->timeTracker;
    $tracker->push($msg, $key);
}
2307 
/**
 * Forwards a pull() call to the time tracker.
 *
 * @return void
 */
public function log_pull()
{
    $tracker = $this->timeTracker;
    $tracker->pull();
}
2317 
/**
 * Sends a message to the time tracker and appends it to the internal log.
 *
 * @param string $msg Message to log
 * @param int $errorNum Error number, passed on to the time tracker
 * @return void
 */
public function log_setTSlogMessage($msg, $errorNum = 0)
{
    $tracker = $this->timeTracker;
    $tracker->setTSlogMessage($msg, $errorNum);
    array_push($this->internal_log, $msg);
}
2330 
/**
 * Normalises a comma-separated keyword list: items are trimmed, joined with ", " and the result is padded with one space on each side.
 *
 * @param string $keywordList Comma-separated keywords
 * @return string Normalised keyword list
 */
protected function addSpacesToKeywordList($keywordList)
{
    $trimmedKeywords = GeneralUtility::trimExplode(',', $keywordList);
    $joined = implode(', ', $trimmedKeywords);
    return ' ' . $joined . ' ';
}
2344 }
readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0)
static trimExplode($delim, $string, $removeEmptyValues=false, $limit=0)
static implodeArrayForUrl($name, array $theArray, $str= '', $skipBlank=false, $rawurlencodeParamName=false)
metaphone($word, $returnRawMetaphoneValue=false)
static writeFile($file, $content, $changePermissions=false)
submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
analyzeHeaderinfo(&$retArr, $content, $key, $offset)
static tempnam($filePrefix, $fileSuffix= '')
indexRegularDocument($file, $force=false, $contentTmpFile= '', $altExtension= '')
backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=[], $createCHash=false)
if(TYPO3_MODE=== 'BE') $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_tsfebeuserauth.php']['frontendEditingController']['default']
static makeInstance($className,...$constructorArguments)
static forceIntegerInRange($theInt, $min, $max=2000000000, $defaultValue=0)
Definition: MathUtility.php:31
static getFileAbsFileName($filename, $_=null, $_2=null)
backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0)