TYPO3 CMS  TYPO3_6-2
Indexer.php
Go to the documentation of this file.
1 <?php
3 
18 
29 class Indexer {
30 
31  // Messages:
/**
 * Log messages keyed by the check code that indexTypo3PageContent() and
 * indexRegularDocument() obtain from checkMtimeTstamp(). Positive codes are
 * logged as reasons for indexing, negative codes as reasons against it.
 *
 * @var array
 */
public $reasons = array(
    -1 => 'mtime matched the document, so no changes detected and no content updated',
    -2 => 'The minimum age was not exceeded',
    1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
    2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
    3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
    4 => 'Page has never been indexed (is not represented in the index_phash table).'
);

/**
 * HTML code blocks to exclude from indexing (comma list of tag names).
 *
 * @var string
 */
public $excludeSections = 'script,style';

/**
 * External parser objects for supported file extensions. Keys are file
 * extension names, values are objects with certain methods.
 *
 * @var array
 */
public $external_parsers = array();

/**
 * Fe-group list (pages might be indexed separately for each usergroup
 * combination to support search in access limited pages!)
 *
 * @var string
 */
public $defaultGrList = '0,-1';

/**
 * If set, this tells a number of seconds that is the maximum age of an
 * indexed document. Regardless of mtime the document will be re-indexed if
 * this limit is exceeded.
 *
 * @var int
 */
public $tstamp_maxAge = 0;

/**
 * If set, this tells a minimum limit before a document can be indexed again.
 * This is regardless of mtime.
 *
 * @var int
 */
public $tstamp_minAge = 0;

/**
 * Max number of external files to index.
 *
 * @var int
 */
public $maxExternalFiles = 0;

/**
 * If TRUE, indexing is forced despite of hashes etc.
 *
 * @var bool
 */
public $forceIndexing = FALSE;

/**
 * Set when crawler is detected (internal)
 *
 * @var bool
 */
public $crawlerActive = FALSE;

// INTERNALS:

/**
 * Template for the content parts array produced by splitHTMLContent().
 *
 * @var array
 */
public $defaultContentArray = array(
    'title' => '',
    'description' => '',
    'keywords' => '',
    'body' => ''
);

/**
 * @var int
 */
public $wordcount = 0;

/**
 * Compared against $maxExternalFiles in indexRegularDocument().
 * NOTE(review): declared here because the methods use it but no declaration
 * is visible in this part of the file - confirm it is not declared elsewhere.
 *
 * @var int
 */
public $externalFileCounter = 0;

/**
 * Configuration set internally (see init functions for required keys and
 * their meaning)
 *
 * @var array
 */
public $conf = array();

/**
 * Indexer configuration, coming from
 * $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
 *
 * @var array
 */
public $indexerConfig = array();

/**
 * Hash array, contains phash and phash_grouping
 *
 * @var array
 */
public $hash = array();

/**
 * Hash array for files
 *
 * @var array
 */
public $file_phash_arr = array();

/**
 * Content of TYPO3 page
 *
 * @var array
 */
public $contentParts = array();

/**
 * Hash over the content parts, set in indexTypo3PageContent().
 *
 * @var mixed
 */
public $content_md5h = '';

/**
 * Internal log
 *
 * @var array
 */
public $internal_log = array();

/**
 * Content fetched by indexExternalUrl().
 * NOTE(review): declared here because indexExternalUrl() assigns it but no
 * declaration is visible in this part of the file - confirm.
 *
 * @var string
 */
public $indexExternalUrl_content = '';

/**
 * cHashparams array
 *
 * @var array
 */
public $cHashParams = array();

/**
 * @var int
 */
public $freqRange = 32000;

/**
 * @var float
 */
public $freqMax = 0.1;

/**
 * Set in init() from the "enableMetaphoneSearch" extension setting.
 *
 * @var bool
 */
public $enableMetaphoneSearch = FALSE;

/**
 * Set in init(): FALSE when the index_words table is used, otherwise the
 * value of $enableMetaphoneSearch.
 * NOTE(review): declared here because init() assigns it but no declaration
 * is visible in this part of the file - confirm.
 *
 * @var bool
 */
public $storeMetaphoneInfoAsWords;

/**
 * @var string
 */
public $metaphoneContent = '';

// Objects:

/**
 * Charset converter, instantiated in init().
 *
 * @var \TYPO3\CMS\Core\Charset\CharsetConverter
 */
public $csObj;

/**
 * Metaphone hook object, instantiated in init() when metaphone search is
 * enabled and a hook is registered.
 * NOTE(review): declared here because init() assigns it but no declaration
 * is visible in this part of the file - confirm.
 *
 * @var object
 */
public $metaphoneObj;

/**
 * Lexer object (class that deconstructs the text into words), instantiated
 * in init().
 *
 * @var object
 */
public $lexerObj;

/**
 * Set in init() from the "flagBitMask" extension setting (forced into 0-255).
 *
 * @var int
 */
public $flagBitMask;
221 
/**
 * Frontend hook: decides whether the page currently rendered by $pObj is to
 * be indexed and, if so, copies the relevant page state into $this->conf,
 * initializes the indexer and indexes the page content.
 *
 * Indexing only happens when config.index_enable is set. It is skipped (with
 * a log message) when frontend indexing is disabled and no crawler is active,
 * when the page has "no_search" set, when the page is rendered with
 * "no_cache", or when sys_language_uid differs from sys_language_content.
 *
 * @param object $pObj Frontend page object (reads ->config, ->page, ->applicationData etc.)
 * @return void
 */
public function hook_indexContent(&$pObj) {
    // Indexer configuration from Extension Manager interface:
    $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

    // Crawler activation:
    // Requirements are that the crawler is loaded, a crawler session is running and
    // re-indexing requested as processing instruction.
    $crawlerData = isset($pObj->applicationData['tx_crawler']) && is_array($pObj->applicationData['tx_crawler'])
        ? $pObj->applicationData['tx_crawler']
        : array();
    // in_array() must only ever see an array, and the comparison has to be strict:
    // with loose comparison an integer 0 in the list would match any string.
    $procInstructions = isset($crawlerData['parameters']['procInstructions']) && is_array($crawlerData['parameters']['procInstructions'])
        ? $crawlerData['parameters']['procInstructions']
        : array();
    if (
        \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')
        && !empty($crawlerData['running'])
        && in_array('tx_indexedsearch_reindex', $procInstructions, TRUE)
    ) {
        // Setting simple log message:
        $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
        // Crawler active flag, and force indexing regardless of hashes:
        $this->crawlerActive = TRUE;
        $this->forceIndexing = TRUE;
    }

    // Nothing to do unless indexing is enabled for this page:
    if (!$pObj->config['config']['index_enable']) {
        return;
    }

    $this->log_push('Index page', '');
    if (!empty($indexerConfig['disableFrontendIndexing']) && !$this->crawlerActive) {
        $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
    } elseif ($pObj->page['no_search']) {
        $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
    } elseif ($pObj->no_cache) {
        $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
    } elseif ((int)$pObj->sys_language_uid !== (int)$pObj->sys_language_content) {
        $this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
    } else {
        // Root line uids
        $rootlineUids = array();
        foreach ($pObj->config['rootLine'] as $rlkey => $rldat) {
            $rootlineUids[$rlkey] = $rldat['uid'];
        }
        // Setting up internal configuration from the page object:
        $this->conf = array(
            // Information about page for which the indexing takes place
            'id' => $pObj->id,
            'type' => $pObj->type,
            // sys_language UID of the language of the indexing.
            'sys_language_uid' => $pObj->sys_language_uid,
            // MP variable, if any (Mount Points)
            'MP' => $pObj->MP,
            // Group list
            'gr_list' => $pObj->gr_list,
            // cHash string for additional parameters
            'cHash' => $pObj->cHash,
            // Array of the additional parameters
            'cHash_array' => $pObj->cHash_array,
            // The creation date of the TYPO3 page
            'crdate' => $pObj->page['crdate'],
            // reg1 of the caching table. Not known what practical use this has.
            'page_cache_reg1' => $pObj->page_cache_reg1,
            'rootline_uids' => $rootlineUids,
            // Content string (HTML of TYPO3 page)
            'content' => $pObj->content,
            // Alternative title for indexing
            'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),
            // Character set of content (will be converted to utf-8 during indexing)
            'metaCharset' => $pObj->metaCharset,
            // Most recent modification time (seconds) of the content on the page.
            // Used to evaluate whether it should be re-indexed.
            'mtime' => isset($pObj->register['SYS_LASTCHANGED']) ? $pObj->register['SYS_LASTCHANGED'] : $pObj->page['SYS_LASTCHANGED'],
            // Whether to index external documents like PDF, DOC etc. (if possible)
            'index_externals' => $pObj->config['config']['index_externals'],
            // Length of description text (max 250, default 200)
            'index_descrLgd' => $pObj->config['config']['index_descrLgd'],
            // Whether to index keywords/description meta tags; defaults to TRUE when not configured
            'index_metatags' => isset($pObj->config['config']['index_metatags']) ? $pObj->config['config']['index_metatags'] : TRUE,
            // Set to zero:
            'recordUid' => 0,
            'freeIndexUid' => 0,
            'freeIndexSetId' => 0
        );
        // Init and start indexing:
        $this->init();
        $this->indexTypo3PageContent();
    }
    $this->log_pull();
}
312 
313  /****************************
314  *
315  * Backend API
316  *
317  ****************************/
/**
 * Backend API: sets up $this->conf for indexing a page-like record outside
 * of frontend rendering and then runs init().
 *
 * @param mixed $id Page id
 * @param mixed $type Page type
 * @param mixed $sys_language_uid sys_language uid of the language of the indexing
 * @param mixed $MP MP variable, if any (Mount Points)
 * @param array $uidRL Root line uids, stored as conf['rootline_uids']
 * @param array $cHash_array Array of the additional parameters
 * @param bool $createCHash If set, a cHash value is generated from $cHash_array; otherwise cHash is ''
 * @return void
 */
public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array = array(), $createCHash = FALSE) {
    // Information about the page for which the indexing takes place.
    // The group list is hardcoded for now.
    $this->conf = array(
        'id' => $id,
        'type' => $type,
        'sys_language_uid' => $sys_language_uid,
        'MP' => $MP,
        'gr_list' => '0,-1'
    );

    // cHash string for additional parameters:
    if (!$createCHash) {
        $this->conf['cHash'] = '';
    } else {
        /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
        $cacheHash = GeneralUtility::makeInstance('TYPO3\\CMS\\Frontend\\Page\\CacheHashCalculator');
        $queryString = GeneralUtility::implodeArrayForUrl('', $cHash_array);
        $this->conf['cHash'] = $cacheHash->generateForParameters($queryString);
    }

    $remainingSettings = array(
        // Array of the additional parameters
        'cHash_array' => $cHash_array,
        // Set to defaults
        'freeIndexUid' => 0,
        'freeIndexSetId' => 0,
        'page_cache_reg1' => '',
        // Root line uids
        'rootline_uids' => $uidRL,
        // Whether to index external documents like PDF, DOC etc. (if possible)
        'index_externals' => 1,
        // Length of description text (max 250, default 200)
        'index_descrLgd' => 200,
        // Whether to index document keywords and description (if present)
        'index_metatags' => TRUE
    );
    foreach ($remainingSettings as $settingName => $settingValue) {
        $this->conf[$settingName] = $settingValue;
    }

    // Init and start indexing:
    $this->init();
}
372 
/**
 * Backend API: stores the "free index" uid and set id in $this->conf.
 *
 * @param mixed $freeIndexUid Value for conf['freeIndexUid']
 * @param mixed $freeIndexSetId Value for conf['freeIndexSetId'] (default 0)
 * @return void
 */
public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0) {
    $freeIndexSettings = array(
        'freeIndexUid' => $freeIndexUid,
        'freeIndexSetId' => $freeIndexSetId
    );
    foreach ($freeIndexSettings as $settingName => $settingValue) {
        $this->conf[$settingName] = $settingValue;
    }
}
385 
/**
 * Backend API: indexes arbitrary record content as if it was a TYPO3 page.
 * The values are escaped with htmlspecialchars() and wrapped into a minimal
 * HTML document, which is then passed through indexTypo3PageContent().
 *
 * @param string $title Title, becomes the <title> of the fake document
 * @param string $keywords Keywords, become the "keywords" meta tag
 * @param string $description Description, becomes the "description" meta tag
 * @param string $content Body content
 * @param string $charset Character set of content (will be converted to utf-8 during indexing)
 * @param mixed $mtime Most recent modification time (seconds) of the content
 * @param mixed $crdate The creation date of the TYPO3 content
 * @param mixed $recordUid UID of the record, if applicable
 * @return void
 */
public function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0) {
    // Meta information about the content:
    $this->conf['mtime'] = $mtime;
    $this->conf['crdate'] = $crdate;
    $this->conf['recordUid'] = $recordUid;

    // Construct fake HTML for parsing (content string as HTML of a TYPO3 page):
    $escapedTitle = htmlspecialchars($title);
    $escapedKeywords = htmlspecialchars($keywords);
    $escapedDescription = htmlspecialchars($description);
    $escapedBody = htmlspecialchars($content);
    $this->conf['content'] = '
 <html>
 <head>
 <title>' . $escapedTitle . '</title>
 <meta name="keywords" content="' . $escapedKeywords . '" />
 <meta name="description" content="' . $escapedDescription . '" />
 </head>
 <body>
 ' . $escapedBody . '
 </body>
 </html>';

    // Initializing charset:
    $this->conf['metaCharset'] = $charset;
    // No alternative title for indexing:
    $this->conf['indexedDocTitle'] = '';

    // Index content as if it was a TYPO3 page:
    $this->indexTypo3PageContent();
}
429 
430  /********************************
431  *
432  * Initialization
433  *
434  *******************************/
/**
 * Initializes the indexer from $this->conf and the extension configuration:
 * cHash parameters, page hashes, age limits, external parsers, lexer,
 * metaphone hook and charset converter.
 *
 * @return void
 */
public function init() {
    // cHash parameters:
    $this->cHashParams = $this->conf['cHash_array'];
    if (is_array($this->cHashParams) && count($this->cHashParams) > 0) {
        if ($this->conf['cHash']) {
            // Add this so that URL's come out right...
            $this->cHashParams['cHash'] = $this->conf['cHash'];
        }
        unset($this->cHashParams['encryptionKey']);
    }

    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();

    // Indexer configuration from Extension Manager interface:
    $this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
    // minAge / maxAge are multiplied by 3600 before being stored as timestamps offsets:
    $this->tstamp_minAge = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->indexerConfig['minAge'] * 3600, 0);
    $this->tstamp_maxAge = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->indexerConfig['maxAge'] * 3600, 0);
    $this->maxExternalFiles = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->indexerConfig['maxExternalFiles'], 0, 1000, 5);
    $this->flagBitMask = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->indexerConfig['flagBitMask'], 0, 255);

    // Workaround: If the extension configuration was not updated yet, the value is not existing
    // and metaphone search counts as enabled.
    $this->enableMetaphoneSearch = !isset($this->indexerConfig['enableMetaphoneSearch'])
        || (bool)$this->indexerConfig['enableMetaphoneSearch'];
    $indexWordsTableUsed = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words');
    $this->storeMetaphoneInfoAsWords = !$indexWordsTableUsed && (bool)$this->enableMetaphoneSearch;

    // Initialize external document parsers:
    // Example configuration, see ext_localconf.php of this file!
    if ($this->conf['index_externals']) {
        $this->initializeExternalParsers();
    }

    // Initialize lexer (class that deconstructs the text into words):
    $lexerObjRef = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'] ?: 'TYPO3\\CMS\\IndexedSearch\\Lexer';
    $this->lexerObj = GeneralUtility::getUserObj($lexerObjRef);
    $this->lexerObj->debug = $this->indexerConfig['debugMode'];

    // Initialize metaphone hook:
    // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
    if ($this->enableMetaphoneSearch && $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']) {
        $this->metaphoneObj = GeneralUtility::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']);
        $this->metaphoneObj->pObj = $this;
    }

    // Init charset class:
    $this->csObj = GeneralUtility::makeInstance('TYPO3\\CMS\\Core\\Charset\\CharsetConverter');
}
481 
/**
 * Instantiates the external document parsers registered in
 * $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] and keeps
 * those whose initParser() call succeeds in $this->external_parsers, keyed by
 * file extension.
 *
 * @return void
 */
public function initializeExternalParsers() {
    $registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'];
    if (!is_array($registeredParsers)) {
        return;
    }
    foreach ($registeredParsers as $extension => $_objRef) {
        $parser = GeneralUtility::getUserObj($_objRef);
        // The parser is registered before it is initialized; the entry is
        // removed again below if initialization fails.
        $this->external_parsers[$extension] = $parser;
        $parser->pObj = $this;
        $initialized = $parser->initParser($extension);
        if (!$initialized) {
            unset($this->external_parsers[$extension]);
        }
    }
}
503 
504  /********************************
505  *
506  * Indexing; TYPO3 pages (HTML content)
507  *
508  *******************************/
/**
 * Indexes the page content held in $this->conf['content'].
 *
 * The page is processed when the mtime/timestamp check is positive, when the
 * group list is not yet registered for the page hash, or when indexing is
 * forced. If a row with the same content hash already exists (and the check
 * code is not 1), only timestamp, set id, group list and rootline are updated;
 * otherwise the content is converted, split into words and submitted.
 *
 * @return void
 */
public function indexTypo3PageContent() {
    $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
    $is_grlist = $this->is_grlist_set($this->hash['phash']);

    $indexingNeeded = $check > 0 || !$is_grlist || $this->forceIndexing;
    if (!$indexingNeeded) {
        $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        return;
    }

    // Setting message:
    if ($this->forceIndexing) {
        $reason = 'Forced';
    } elseif ($check > 0) {
        $reason = $this->reasons[$check];
    } else {
        $reason = 'Updates gr_list!';
    }
    $this->log_setTSlogMessage('Indexing needed, reason: ' . $reason, 1);

    // Divide into title,keywords,description and body:
    $this->log_push('Split content', '');
    $this->contentParts = $this->splitHTMLContent($this->conf['content']);
    if ($this->conf['indexedDocTitle']) {
        $this->contentParts['title'] = $this->conf['indexedDocTitle'];
    }
    $this->log_pull();

    // Calculating a hash over what is to be the actual page content. Title, description
    // and keywords are included so that a changed page title also makes a difference.
    $joinedContent = implode('', $this->contentParts);
    $this->content_md5h = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($joinedContent);

    // checkContentHash() checks if there is already a page (with gr_list = 0,-1) indexed with the
    // very same content hash. If so, the page is already indexed and, regardless of mtime and
    // original content, only the bookkeeping needs updating. This also prevents re-indexing when a
    // fe_user has logged in and the page content turns out to be unchanged anyway.
    $checkCHash = $this->checkContentHash();
    if (is_array($checkCHash) && $check !== 1) {
        // Update the timestamp
        $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
        $this->updateSetId($this->hash['phash']);
        // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
        $this->update_grlist($checkCHash['phash'], $this->hash['phash']);
        $this->updateRootline();
        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
        return;
    }

    $parseStart = GeneralUtility::milliseconds();

    $this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
    $this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
    $this->log_pull();

    // Splitting words
    $this->log_push('Extract words from content', '');
    $splitInWords = $this->processWordsInArrays($this->contentParts);
    $this->log_pull();

    // Analyse the indexed words.
    $this->log_push('Analyse the extracted words', '');
    $indexArr = $this->indexAnalyze($splitInWords);
    $this->log_pull();

    // Submitting page (phash) record
    $this->log_push('Submitting page', '');
    $this->submitPage();
    $this->log_pull();

    // Check words and submit to word list if not there
    $this->log_push('Check word list and submit words', '');
    $wordTableInUse = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words');
    if ($wordTableInUse) {
        $this->checkWordList($indexArr);
        $this->submitWords($indexArr, $this->hash['phash']);
    }
    $this->log_pull();

    // Set parsetime
    $this->updateParsetime($this->hash['phash'], GeneralUtility::milliseconds() - $parseStart);

    // Checking external files if configured for.
    $this->log_push('Checking external files', '');
    if ($this->conf['index_externals']) {
        $this->extractLinks($this->conf['content']);
    }
    $this->log_pull();
}
585 
/**
 * Splits HTML content into title, keywords, description and body.
 *
 * Everything from the first "<body" onwards is the body, everything before
 * it is the head. If the document has no "<body" at all, the whole content is
 * treated as head (so title and meta tags are still found) and the body is
 * empty.
 *
 * @param string $content HTML content to index
 * @return array Array with the keys of $this->defaultContentArray (title, description, keywords, body)
 */
public function splitHTMLContent($content) {
    // divide head from body
    $contentArr = $this->defaultContentArray;
    $bodyPart = stristr($content, '<body');
    if ($bodyPart === FALSE) {
        // Without this branch substr($content, 0, -0) would yield an empty head
        // and title/meta information would be lost.
        $contentArr['body'] = '';
        $headPart = $content;
    } else {
        $contentArr['body'] = $bodyPart;
        $headPart = substr($content, 0, strlen($content) - strlen($bodyPart));
    }

    // get title; if it contains a colon, only the part after the first colon is used
    $this->embracingTags($headPart, 'TITLE', $contentArr['title'], $unusedAfter, $unusedParams);
    $titleParts = explode(':', $contentArr['title'], 2);
    $contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);

    // get keywords and description metatags
    if ($this->conf['index_metatags']) {
        $remainingHead = $headPart;
        $metaParams = '';
        // Each call consumes one <meta ...> tag and leaves the rest of the head in $remainingHead.
        while ($this->embracingTags($remainingHead, 'meta', $unusedContent, $remainingHead, $metaParams)) {
            $attributes = GeneralUtility::get_tag_attributes($metaParams);
            // Meta tags without name/content attributes (e.g. http-equiv tags) are tolerated.
            $metaName = isset($attributes['name']) ? $attributes['name'] : '';
            $metaValue = isset($attributes['content']) ? $attributes['content'] : '';
            if (stristr($metaName, 'keywords')) {
                $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaValue);
            }
            if (stristr($metaName, 'description')) {
                $contentArr['description'] .= ',' . $metaValue;
            }
        }
    }

    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($contentArr['body']);

    // Get rid of unwanted sections (ie. scripting and style stuff) in body
    foreach (explode(',', $this->excludeSections) as $tag) {
        while ($this->embracingTags($contentArr['body'], $tag, $unusedContent, $contentArr['body'], $unusedParams)) {
            // Each iteration removes one section; nothing else to do.
        }
    }

    // remove tags, but first make sure we don't concatenate words by doing it
    $contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
    $contentArr['body'] = trim(strip_tags($contentArr['body']));
    $contentArr['keywords'] = trim($contentArr['keywords']);
    $contentArr['description'] = trim($contentArr['description']);

    return $contentArr;
}
638 
/**
 * Extracts the charset from the CONTENT-TYPE http-equiv meta tag of an HTML
 * document.
 *
 * @param string $content HTML content
 * @return string|NULL The charset value found in the meta tag, NULL if there is no such tag or no charset in it
 */
public function getHTMLcharset($content) {
    $contentTypeMetaPattern = '/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i';
    $charsetPattern = '/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i';

    if (!preg_match($contentTypeMetaPattern, $content, $metaTagMatch)) {
        return NULL;
    }
    if (!preg_match($charsetPattern, $metaTagMatch[0], $charsetMatch)) {
        return NULL;
    }
    return $charsetMatch[1];
}
653 
/**
 * Converts an HTML document to UTF-8 and resolves its entities.
 *
 * @param string $content HTML content
 * @param string $charset Charset of the content; if empty it is detected with getHTMLcharset()
 * @return string Converted content
 */
public function convertHTMLToUtf8($content, $charset = '') {
    // Find charset:
    if (!$charset) {
        $charset = $this->getHTMLcharset($content);
    }
    $charset = $this->csObj->parse_charset($charset);

    // Convert charset, unless it is unknown or the content is UTF-8 already:
    $needsConversion = $charset && $charset !== 'utf-8';
    if ($needsConversion) {
        $content = $this->csObj->utf8_encode($content, $charset);
    }

    // Convert entities, assuming document is now UTF-8:
    return $this->csObj->entities_to_utf8($content, TRUE);
}
674 
/**
 * Finds the first occurrence of a tag (case-insensitive) in a string and
 * reports its content, its attribute string and the string with that tag
 * removed.
 *
 * If the tag has a matching end tag, $stringAfter is the input with the whole
 * element cut out. If there is no end tag, $tagContent is empty and
 * $stringAfter is only what follows the start tag (the text before the tag is
 * not included).
 *
 * @param string $string Content to search in
 * @param string $tagName Tag name, e.g. "title"
 * @param string $tagContent Set to the content between start and end tag
 * @param string $stringAfter Set to the remaining string, see above
 * @param string $paramList Set to the text between the tag name and the closing ">" of the start tag
 * @return bool TRUE if the start tag was found, otherwise FALSE (the reference parameters are then left untouched)
 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList) {
    $endTag = '</' . $tagName . '>';
    $startTag = '<' . $tagName;

    // stristr used because we want a case-insensitive search for the tag.
    $fromStartTag = stristr($string, $startTag);
    if (!$fromStartTag) {
        return FALSE;
    }

    // Split "attributes>rest". A start tag that is never closed with ">" yields
    // only one part; the rest is then the empty string instead of an undefined offset.
    $tagParts = explode('>', substr($fromStartTag, strlen($startTag)), 2);
    $paramList = $tagParts[0];
    $afterStartTag = isset($tagParts[1]) ? $tagParts[1] : '';

    $fromEndTag = stristr($afterStartTag, $endTag);
    if ($fromEndTag) {
        $stringBefore = substr($string, 0, stripos($string, $startTag));
        $tagContent = substr($afterStartTag, 0, strlen($afterStartTag) - strlen($fromEndTag));
        $stringAfter = $stringBefore . substr($fromEndTag, strlen($endTag));
    } else {
        $tagContent = '';
        $stringAfter = $afterStartTag;
    }
    return TRUE;
}
709 
/**
 * Reduces $body to the sections selected by <!--TYPO3SEARCH_begin--> and
 * <!--TYPO3SEARCH_end--> comment markers.
 *
 * Text following a "begin" marker is kept. An "end" marker appends the text
 * that preceded the first marker, unless a "begin" marker was seen in between;
 * text following an "end" marker is dropped. If no marker is present, $body is
 * left unchanged.
 *
 * @param string $body HTML content, passed by reference and modified in place
 * @return bool TRUE if at least one marker was found, otherwise FALSE
 */
public function typoSearchTags(&$body) {
    $expBody = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body);
    if (count($expBody) <= 1) {
        return FALSE;
    }

    $body = '';
    // Initialized so that a leading piece that happens to read "end" does not
    // access an unset variable.
    $prev = '';
    foreach ($expBody as $val) {
        $part = explode('-->', $val, 2);
        $marker = trim($part[0]);
        if ($marker === 'begin') {
            // A marker that is never closed with "-->" contributes nothing.
            $body .= isset($part[1]) ? $part[1] : '';
            $prev = '';
        } elseif ($marker === 'end') {
            $body .= $prev;
        } else {
            $prev = $val;
        }
    }
    return TRUE;
}
737 
/**
 * Extracts the hyperlinks from HTML content and indexes what they point to:
 * external URLs (if "indexExternalURLs" is configured) and local files.
 * Local files are either indexed directly or, if "useCrawlerForExternalFiles"
 * is configured and the crawler extension is loaded, added to the crawler
 * queue.
 *
 * @param string $content HTML content to index. To index HTML content you must analyse it and try to find the links.
 * @return void
 */
public function extractLinks($content) {
    // Get links:
    $list = $this->extractHyperLinks($content);

    // Stays NULL unless files are to be queued for the crawler; it is tested
    // with is_object() below and must therefore always be defined.
    $crawler = NULL;
    if (!empty($this->indexerConfig['useCrawlerForExternalFiles']) && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
        $this->includeCrawlerClass();
        $crawler = GeneralUtility::makeInstance('tx_crawler_lib');
    }

    // Traverse links:
    foreach ($list as $linkInfo) {
        // localPath means: This file is sent by a download script. While the indexed URL has to
        // point to $linkInfo['href'], the absolute path to the file is specified here!
        $hasLocalPath = !empty($linkInfo['localPath']);
        // Decode entities:
        $linkSource = htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);

        // Parse URL:
        $qParts = parse_url($linkSource);
        // Check for jumpurl (TYPO3 specific thing...)
        if (!empty($qParts['query']) && strstr($qParts['query'], 'jumpurl=')) {
            parse_str($qParts['query'], $getP);
            $linkSource = isset($getP['jumpurl']) ? $getP['jumpurl'] : '';
            $qParts = parse_url($linkSource);
        }

        if (!$hasLocalPath && !empty($qParts['scheme'])) {
            if (!empty($this->indexerConfig['indexExternalURLs'])) {
                // Index external URL (http or otherwise)
                $this->indexExternalUrl($linkSource);
            }
            continue;
        }
        // Links with a query string are not treated as files:
        if (!empty($qParts['query'])) {
            continue;
        }

        $linkSource = urldecode($linkSource);
        if (GeneralUtility::isAllowedAbsPath($linkSource)) {
            $localFile = $linkSource;
        } else {
            $localFile = GeneralUtility::getFileAbsFileName(PATH_site . $linkSource);
        }
        if (!$localFile || !@is_file($localFile)) {
            continue;
        }

        // Index local file:
        if (is_object($crawler)) {
            $params = array('document' => $linkSource);
            if ($hasLocalPath) {
                // The URL to show for the document differs from the file location.
                $params['alturl'] = $linkInfo['href'];
            }
            $params['conf'] = $this->conf;
            // The page content is not passed on with the queue entry:
            unset($params['conf']['content']);
            $crawler->addQueueEntry_callBack(0, $params, '&TYPO3\\CMS\\IndexedSearch\\Hook\\CrawlerFilesHook', $this->conf['id']);
            $this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
        } elseif ($hasLocalPath) {
            $fI = pathinfo($linkSource);
            // Files without an extension have no 'extension' key:
            $ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
            $this->indexRegularDocument($linkInfo['href'], FALSE, $linkSource, $ext);
        } else {
            $this->indexRegularDocument($linkSource);
        }
    }
}
815 
/**
 * Collects all <a> tags with a non-empty href that does not start with "#".
 *
 * @param string $html HTML content
 * @return array List of arrays with the keys "tag" (the tag markup), "href" and "localPath" (result of createLocalPath() for the href)
 */
public function extractHyperLinks($html) {
    $htmlParser = GeneralUtility::makeInstance('TYPO3\CMS\Core\Html\HtmlParser');
    $hyperLinksData = array();
    foreach ($htmlParser->splitTags('a', $html) as $index => $tagData) {
        // Only the odd-indexed parts are inspected as tags:
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, TRUE);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) != 'a') {
            continue;
        }
        $href = $tagAttributes[0]['href'];
        if ($href && $href[0] != '#') {
            $hyperLinksData[] = array(
                'tag' => $tagData,
                'href' => $href,
                'localPath' => $this->createLocalPath($href)
            );
        }
    }
    return $hyperLinksData;
}
845 
/**
 * Finds the href of the first <base> tag that has a non-empty href.
 *
 * @param string $html HTML content
 * @return string The href value; empty if no <base> tag with an href was found
 */
public function extractBaseHref($html) {
    $htmlParser = GeneralUtility::makeInstance('TYPO3\CMS\Core\Html\HtmlParser');
    $href = '';
    foreach ($htmlParser->splitTags('base', $html) as $index => $tagData) {
        // Only the odd-indexed parts are inspected as tags:
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, TRUE);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) != 'base') {
            continue;
        }
        $href = $tagAttributes[0]['href'];
        if ($href) {
            break;
        }
    }
    return $href;
}
870 
871  /******************************************
872  *
873  * Indexing; external URL
874  *
875  ******************************************/
/**
 * Indexes the document behind an external URL, provided the response headers
 * declare it as text/html. The content is fetched, written to a temporary
 * file, indexed as an "html" document and the temporary file is removed again.
 *
 * @param string $externalUrl URL, http://....
 * @return void
 */
public function indexExternalUrl($externalUrl) {
    // Get headers; getUrlHeaders() returns NULL when nothing could be fetched:
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    if (!isset($urlHeaders['Content-Type']) || !stristr($urlHeaders['Content-Type'], 'text/html')) {
        return;
    }

    $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
    if (!strlen($content)) {
        return;
    }

    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    if (!$tmpFile) {
        return;
    }
    GeneralUtility::writeFile($tmpFile, $content);
    // Index that file. Using "TRUE" for second parameter to force indexing of
    // external URLs (mtime doesn't make sense, does it?)
    $this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');
    unlink($tmpFile);
}
906 
/**
 * Fetches the headers of a URL and returns them as an array.
 *
 * Every header line is split at its first colon into key and value; the value
 * is stored as-is (not trimmed). Lines without a colon, such as the HTTP
 * status line, are stored with a NULL value.
 *
 * @param string $url URL to fetch the headers for
 * @return array|NULL Header name => value pairs, or NULL if no content was returned for the URL
 */
public function getUrlHeaders($url) {
    // Try to get the headers only
    $content = GeneralUtility::getUrl($url, 2);
    if (!strlen($content)) {
        return NULL;
    }

    // Compile headers:
    $retVal = array();
    foreach (GeneralUtility::trimExplode(LF, $content, TRUE) as $line) {
        if (!strlen(trim($line))) {
            break;
        }
        $lineParts = explode(':', $line, 2);
        $retVal[$lineParts[0]] = isset($lineParts[1]) ? $lineParts[1] : NULL;
    }
    return $retVal;
}
932 
/**
 * Tries to resolve a link to a local file path by asking a fixed sequence of
 * resolver methods; the first non-empty result wins.
 *
 * @param string $sourcePath The link as found in the document
 * @return string Local file path, or an empty value if no resolver produced one
 */
protected function createLocalPath($sourcePath) {
    static $pathFunctions = array(
        'createLocalPathFromT3vars',
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
        'createLocalPathFromRelativeURL'
    );
    $localPath = '';
    foreach ($pathFunctions as $functionName) {
        $localPath = call_user_func(array($this, $functionName), $sourcePath);
        if ($localPath != '') {
            return $localPath;
        }
    }
    return $localPath;
}
956 
/**
 * Looks the link up in $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'],
 * which is keyed by the short md5 of the link.
 *
 * @param string $sourcePath The link as found in the document
 * @return string The registered file path if it is an existing file, otherwise ''
 */
protected function createLocalPathFromT3vars($sourcePath) {
    $indexLocalFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
    if (!is_array($indexLocalFiles)) {
        return '';
    }
    $md5 = GeneralUtility::shortMD5($sourcePath);
    // Note: not using self::isAllowedLocalFile here because this method
    // is allowed to index files outside of the web site (for example,
    // protected downloads)
    $isRegisteredFile = isset($indexLocalFiles[$md5]) && is_file($indexLocalFiles[$md5]);
    return $isRegisteredFile ? $indexLocalFiles[$md5] : '';
}
979 
/**
 * Resolves a link that starts with the site URL (TYPO3_SITE_URL) to a file
 * below PATH_site.
 *
 * @param string $sourcePath The link as found in the document
 * @return string Local file path if the link starts with the site URL and points to an allowed file, otherwise ''
 */
protected function createLocalPathUsingDomainURL($sourcePath) {
    $siteUrl = GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
    $siteUrlLength = strlen($siteUrl);
    if (substr($sourcePath, 0, $siteUrlLength) != $siteUrl) {
        return '';
    }
    $candidate = PATH_site . substr($sourcePath, $siteUrlLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
999 
/**
 * Resolves a link that starts with the configured config.absRefPrefix to a
 * file below PATH_site. Only applies when $GLOBALS['TSFE'] is a
 * TypoScriptFrontendController and the prefix is not empty.
 *
 * @param string $sourcePath The link as found in the document
 * @return string Local file path if the link starts with the prefix and points to an allowed file, otherwise ''
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath) {
    if (!$GLOBALS['TSFE'] instanceof \TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController) {
        return '';
    }
    $absRefPrefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
    $prefixLength = strlen($absRefPrefix);
    $startsWithPrefix = $prefixLength > 0 && substr($sourcePath, 0, $prefixLength) == $absRefPrefix;
    if (!$startsWithPrefix) {
        return '';
    }
    $candidate = PATH_site . substr($sourcePath, $prefixLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1022 
/**
 * Resolves a link starting with "/" to a file below PATH_site.
 *
 * @param string $sourcePath The link as found in the document
 * @return string Local file path if the link starts with "/" and points to an allowed file, otherwise ''
 */
protected function createLocalPathFromAbsoluteURL($sourcePath) {
    if ($sourcePath[0] != '/') {
        return '';
    }
    // Drop the leading slash before appending to PATH_site:
    $candidate = PATH_site . substr($sourcePath, 1);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1041 
/**
 * Resolves a relative link (see isRelativeURL()) to a file below PATH_site.
 *
 * @param string $sourcePath The link as found in the document
 * @return string Local file path if the link is relative and points to an allowed file, otherwise ''
 */
protected function createLocalPathFromRelativeURL($sourcePath) {
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }
    $candidate = PATH_site . $sourcePath;
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1058 
/**
 * Checks whether a URL is relative: it has no scheme and its path does not
 * start with "/". A URL that parse_url() cannot parse is not considered
 * relative.
 *
 * @param string $url URL to check
 * @return bool TRUE if the URL is relative, otherwise FALSE
 */
static protected function isRelativeURL($url) {
    $urlParts = @parse_url($url);
    if (!is_array($urlParts)) {
        // parse_url() returns FALSE for seriously malformed URLs.
        return FALSE;
    }
    // parse_url() only returns the components that are present in the URL.
    $scheme = isset($urlParts['scheme']) ? $urlParts['scheme'] : '';
    $path = isset($urlParts['path']) ? (string)$urlParts['path'] : '';
    return $scheme == '' && substr($path, 0, 1) !== '/';
}
1069 
/**
 * Checks whether a path, after resolving back paths, lies below PATH_site and
 * is an existing file.
 *
 * @param string $filePath File path to check
 * @return bool TRUE if the file exists inside PATH_site, otherwise FALSE
 */
static protected function isAllowedLocalFile($filePath) {
    $resolvedPath = GeneralUtility::resolveBackPath($filePath);
    $sitePathLength = strlen(PATH_site);
    $isFile = is_file($resolvedPath);
    if (substr($resolvedPath, 0, $sitePathLength) != PATH_site) {
        return FALSE;
    }
    return $isFile;
}
1082 
1083  /******************************************
1084  *
1085  * Indexing; external files (PDF, DOC, etc)
1086  *
1087  ******************************************/
1098  public function indexRegularDocument($file, $force = FALSE, $contentTmpFile = '', $altExtension = '') {
1099  // Init
1100  $fI = pathinfo($file);
1101  $ext = $altExtension ?: strtolower($fI['extension']);
1102  // Create abs-path:
1103  if (!$contentTmpFile) {
1104  if (!GeneralUtility::isAbsPath($file)) {
1105  // Relative, prepend PATH_site:
1106  $absFile = GeneralUtility::getFileAbsFileName(PATH_site . $file);
1107  } else {
1108  // Absolute, pass-through:
1109  $absFile = $file;
1110  }
1111  $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
1112  } else {
1113  $absFile = $contentTmpFile;
1114  }
1115  // Indexing the document:
1116  if ($absFile && @is_file($absFile)) {
1117  if ($this->external_parsers[$ext]) {
1118  $fileInfo = stat($absFile);
1119  $cParts = $this->fileContentParts($ext, $absFile);
1120  foreach ($cParts as $cPKey) {
1121  $this->internal_log = array();
1122  $this->log_push('Index: ' . str_replace('.', '_', basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
1123  $Pstart = GeneralUtility::milliseconds();
1124  $subinfo = array('key' => $cPKey);
1125  // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
1126  $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
1127  $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
1128  if ($check > 0 || $force) {
1129  if ($check > 0) {
1130  $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
1131  } else {
1132  $this->log_setTSlogMessage('Indexing forced by flag', 1);
1133  }
1134  // Check external file counter:
1135  if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
1136  // Divide into title,keywords,description and body:
1137  $this->log_push('Split content', '');
1138  $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
1139  $this->log_pull();
1140  if (is_array($contentParts)) {
1141  // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
1143  if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
1144  // Increment counter:
1145  $this->externalFileCounter++;
1146  // Splitting words
1147  $this->log_push('Extract words from content', '');
1148  $splitInWords = $this->processWordsInArrays($contentParts);
1149  $this->log_pull();
1150  // Analyse the indexed words.
1151  $this->log_push('Analyse the extracted words', '');
1152  $indexArr = $this->indexAnalyze($splitInWords);
1153  $this->log_pull();
1154  // Submitting page (phash) record
1155  $this->log_push('Submitting page', '');
1156  // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
1157  $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
1158  $this->log_pull();
1159  // Check words and submit to word list if not there
1160  $this->log_push('Check word list and submit words', '');
1161  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words')) {
1162  $this->checkWordList($indexArr);
1163  $this->submitWords($indexArr, $phash_arr['phash']);
1164  }
1165  $this->log_pull();
1166  // Set parsetime
1167  $this->updateParsetime($phash_arr['phash'], GeneralUtility::milliseconds() - $Pstart);
1168  } else {
1169  // Update the timestamp
1170  $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
1171  $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
1172  }
1173  } else {
1174  $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
1175  }
1176  } else {
1177  $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
1178  }
1179  } else {
1180  $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
1181  }
1182  // Checking and setting sections:
1183  $this->submitFile_section($phash_arr['phash']);
1184  // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
1185  $this->log_pull();
1186  }
1187  } else {
1188  $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
1189  }
1190  } else {
1191  $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
1192  }
1193  }
1194 
1205  public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer) {
1206  $contentArray = NULL;
1207  // Consult relevant external document parser:
1208  if (is_object($this->external_parsers[$fileExtension])) {
1209  $contentArray = $this->external_parsers[$fileExtension]->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
1210  }
1211  return $contentArray;
1212  }
1213 
1222  public function fileContentParts($ext, $absFile) {
1223  $cParts = array(0);
1224  // Consult relevant external document parser:
1225  if (is_object($this->external_parsers[$ext])) {
1226  $cParts = $this->external_parsers[$ext]->fileContentParts($ext, $absFile);
1227  }
1228  return $cParts;
1229  }
1230 
1239  public function splitRegularContent($content) {
1240  $contentArr = $this->defaultContentArray;
1241  $contentArr['body'] = $content;
1242  return $contentArr;
1243  }
1244 
1245  /**********************************
1246  *
1247  * Analysing content, Extracting words
1248  *
1249  **********************************/
1258  public function charsetEntity2utf8(&$contentArr, $charset) {
1259  // Convert charset if necessary
1260  foreach ($contentArr as $key => $value) {
1261  if (strlen($contentArr[$key])) {
1262  if ($charset !== 'utf-8') {
1263  $contentArr[$key] = $this->csObj->utf8_encode($contentArr[$key], $charset);
1264  }
1265  // decode all numeric / html-entities in the string to real characters:
1266  $contentArr[$key] = $this->csObj->entities_to_utf8($contentArr[$key], TRUE);
1267  }
1268  }
1269  }
1270 
1278  public function processWordsInArrays($contentArr) {
1279  // split all parts to words
1280  foreach ($contentArr as $key => $value) {
1281  $contentArr[$key] = $this->lexerObj->split2Words($contentArr[$key]);
1282  }
1283  // For title, keywords, and description we don't want duplicates:
1284  $contentArr['title'] = array_unique($contentArr['title']);
1285  $contentArr['keywords'] = array_unique($contentArr['keywords']);
1286  $contentArr['description'] = array_unique($contentArr['description']);
1287  // Return modified array:
1288  return $contentArr;
1289  }
1290 
1298  public function bodyDescription($contentArr) {
1299  // Setting description
1300  $maxL = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
1301  if ($maxL) {
1302  $bodyDescription = str_replace(array(' ', TAB, CR, LF), ' ', $contentArr['body']);
1303  // Shorten the string:
1304  $bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
1305  }
1306  return $bodyDescription;
1307  }
1308 
1316  public function indexAnalyze($content) {
1317  $indexArr = array();
1318  $counter = 0;
1319  $this->analyzeHeaderinfo($indexArr, $content, 'title', 7);
1320  $this->analyzeHeaderinfo($indexArr, $content, 'keywords', 6);
1321  $this->analyzeHeaderinfo($indexArr, $content, 'description', 5);
1322  $this->analyzeBody($indexArr, $content);
1323  return $indexArr;
1324  }
1325 
1336  public function analyzeHeaderinfo(&$retArr, $content, $key, $offset) {
1337  foreach ($content[$key] as $val) {
1338  $val = substr($val, 0, 60);
1339  // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
1340  if (!isset($retArr[$val])) {
1341  // Word ID (wid)
1343  // Metaphone value is also 60 only chars long
1344  $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
1345  $retArr[$val]['metaphone'] = $metaphone;
1346  }
1347  // Build metaphone fulltext string (can be used for fulltext indexing)
1348  if ($this->storeMetaphoneInfoAsWords) {
1349  $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
1350  }
1351  // Priority used for flagBitMask feature (see extension configuration)
1352  $retArr[$val]['cmp'] = $retArr[$val]['cmp'] | pow(2, $offset);
1353  // Increase number of occurences
1354  $retArr[$val]['count']++;
1355  $this->wordcount++;
1356  }
1357  }
1358 
1367  public function analyzeBody(&$retArr, $content) {
1368  foreach ($content['body'] as $key => $val) {
1369  $val = substr($val, 0, 60);
1370  // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
1371  if (!isset($retArr[$val])) {
1372  // First occurence (used for ranking results)
1373  $retArr[$val]['first'] = $key;
1374  // Word ID (wid)
1376  // Metaphone value is also only 60 chars long
1377  $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
1378  $retArr[$val]['metaphone'] = $metaphone;
1379  }
1380  // Build metaphone fulltext string (can be used for fulltext indexing)
1381  if ($this->storeMetaphoneInfoAsWords) {
1382  $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
1383  }
1384  // Increase number of occurences
1385  $retArr[$val]['count']++;
1386  $this->wordcount++;
1387  }
1388  }
1389 
1398  public function metaphone($word, $returnRawMetaphoneValue = FALSE) {
1399  if (is_object($this->metaphoneObj)) {
1400  $metaphoneRawValue = $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid']);
1401  } else {
1402  // Use native PHP function instead of advanced doubleMetaphone class
1403  $metaphoneRawValue = metaphone($word);
1404  }
1405  if ($returnRawMetaphoneValue) {
1406  $result = $metaphoneRawValue;
1407  } elseif (strlen($metaphoneRawValue)) {
1408  // Create hash and return integer
1410  } else {
1411  $result = 0;
1412  }
1413  return $result;
1414  }
1415 
1416  /********************************
1417  *
1418  * SQL; TYPO3 Pages
1419  *
1420  *******************************/
1427  public function submitPage() {
1428  // Remove any current data for this phash:
1429  $this->removeOldIndexedPages($this->hash['phash']);
1430  // setting new phash_row
1431  $fields = array(
1432  'phash' => $this->hash['phash'],
1433  'phash_grouping' => $this->hash['phash_grouping'],
1434  'cHashParams' => serialize($this->cHashParams),
1435  'contentHash' => $this->content_md5h,
1436  'data_page_id' => $this->conf['id'],
1437  'data_page_reg1' => $this->conf['page_cache_reg1'],
1438  'data_page_type' => $this->conf['type'],
1439  'data_page_mp' => $this->conf['MP'],
1440  'gr_list' => $this->conf['gr_list'],
1441  'item_type' => 0,
1442  // TYPO3 page
1443  'item_title' => $this->contentParts['title'],
1444  'item_description' => $this->bodyDescription($this->contentParts),
1445  'item_mtime' => (int) $this->conf['mtime'],
1446  'item_size' => strlen($this->conf['content']),
1447  'tstamp' => $GLOBALS['EXEC_TIME'],
1448  'crdate' => $GLOBALS['EXEC_TIME'],
1449  'item_crdate' => $this->conf['crdate'],
1450  // Creation date of page
1451  'sys_language_uid' => $this->conf['sys_language_uid'],
1452  // Sys language uid of the page. Should reflect which language it DOES actually display!
1453  'externalUrl' => 0,
1454  'recordUid' => (int)$this->conf['recordUid'],
1455  'freeIndexUid' => (int)$this->conf['freeIndexUid'],
1456  'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
1457  );
1458  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
1459  $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);
1460  }
1461  // PROCESSING index_section
1462  $this->submit_section($this->hash['phash'], $this->hash['phash']);
1463  // PROCESSING index_grlist
1464  $this->submit_grlist($this->hash['phash'], $this->hash['phash']);
1465  // PROCESSING index_fulltext
1466  $fields = array(
1467  'phash' => $this->hash['phash'],
1468  'fulltextdata' => implode(' ', $this->contentParts),
1469  'metaphonedata' => $this->metaphoneContent
1470  );
1471  if ($this->indexerConfig['fullTextDataLength'] > 0) {
1472  $fields['fulltextdata'] = substr($fields['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
1473  }
1474  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_fulltext')) {
1475  $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);
1476  }
1477  // PROCESSING index_debug
1478  if ($this->indexerConfig['debugMode']) {
1479  $fields = array(
1480  'phash' => $this->hash['phash'],
1481  'debuginfo' => serialize(array(
1482  'cHashParams' => $this->cHashParams,
1483  'external_parsers initialized' => array_keys($this->external_parsers),
1484  'conf' => array_merge($this->conf, array('content' => substr($this->conf['content'], 0, 1000))),
1485  'contentParts' => array_merge($this->contentParts, array('body' => substr($this->contentParts['body'], 0, 1000))),
1486  'logs' => $this->internal_log,
1487  'lexer' => $this->lexerObj->debugString
1488  ))
1489  );
1490  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_debug')) {
1491  $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $fields);
1492  }
1493  }
1494  }
1495 
1505  public function submit_grlist($hash, $phash_x) {
1506  // Setting the gr_list record
1507  $fields = array(
1508  'phash' => $hash,
1509  'phash_x' => $phash_x,
1510  'hash_gr_list' => \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->conf['gr_list']),
1511  'gr_list' => $this->conf['gr_list']
1512  );
1513  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
1514  $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', $fields);
1515  }
1516  }
1517 
1527  public function submit_section($hash, $hash_t3) {
1528  $fields = array(
1529  'phash' => $hash,
1530  'phash_t3' => $hash_t3,
1531  'page_id' => (int)$this->conf['id']
1532  );
1533  $this->getRootLineFields($fields);
1534  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
1535  $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $fields);
1536  }
1537  }
1538 
1546  public function removeOldIndexedPages($phash) {
1547  // Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
1548  $tableArray = explode(',', 'index_phash,index_section,index_grlist,index_fulltext,index_debug');
1549  foreach ($tableArray as $table) {
1550  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed($table)) {
1551  $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash=' . (int)$phash);
1552  }
1553  }
1554  // Removing all index_section records with hash_t3 set to this hash (this includes such records set for external media on the page as well!). The re-insert of these records are done in indexRegularDocument($file).
1555  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
1556  $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3=' . (int)$phash);
1557  }
1558  }
1559 
1560  /********************************
1561  *
1562  * SQL; External media
1563  *
1564  *******************************/
1580  public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts) {
1581  // Find item Type:
1582  $storeItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
1583  $storeItemType = $storeItemType ?: $ext;
1584  // Remove any current data for this phash:
1585  $this->removeOldIndexedFiles($hash['phash']);
1586  // Split filename:
1587  $fileParts = parse_url($file);
1588  // Setting new
1589  $fields = array(
1590  'phash' => $hash['phash'],
1591  'phash_grouping' => $hash['phash_grouping'],
1592  'cHashParams' => serialize($subinfo),
1593  'contentHash' => $content_md5h,
1594  'data_filename' => $file,
1595  'item_type' => $storeItemType,
1596  'item_title' => trim($contentParts['title']) ?: basename($file),
1597  'item_description' => $this->bodyDescription($contentParts),
1598  'item_mtime' => $mtime,
1599  'item_size' => $size,
1600  'item_crdate' => $ctime,
1601  'tstamp' => $GLOBALS['EXEC_TIME'],
1602  'crdate' => $GLOBALS['EXEC_TIME'],
1603  'gr_list' => $this->conf['gr_list'],
1604  'externalUrl' => $fileParts['scheme'] ? 1 : 0,
1605  'recordUid' => (int)$this->conf['recordUid'],
1606  'freeIndexUid' => (int)$this->conf['freeIndexUid'],
1607  'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
1608  'sys_language_uid' => (int)$this->conf['sys_language_uid']
1609  );
1610  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
1611  $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);
1612  }
1613  // PROCESSING index_fulltext
1614  $fields = array(
1615  'phash' => $hash['phash'],
1616  'fulltextdata' => implode(' ', $contentParts),
1617  'metaphonedata' => $this->metaphoneContent
1618  );
1619  if ($this->indexerConfig['fullTextDataLength'] > 0) {
1620  $fields['fulltextdata'] = substr($fields['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
1621  }
1622  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_fulltext')) {
1623  $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);
1624  }
1625  // PROCESSING index_debug
1626  if ($this->indexerConfig['debugMode']) {
1627  $fields = array(
1628  'phash' => $hash['phash'],
1629  'debuginfo' => serialize(array(
1630  'cHashParams' => $subinfo,
1631  'contentParts' => array_merge($contentParts, array('body' => substr($contentParts['body'], 0, 1000))),
1632  'logs' => $this->internal_log,
1633  'lexer' => $this->lexerObj->debugString
1634  ))
1635  );
1636  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_debug')) {
1637  $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $fields);
1638  }
1639  }
1640  }
1641 
1649  public function submitFile_grlist($hash) {
1650  // Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
1651  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
1652  $count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', 'phash=' . (int)$hash . ' AND (hash_gr_list=' . \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->defaultGrList) . ' OR hash_gr_list=' . \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->conf['gr_list']) . ')');
1653  if ($count == 0) {
1654  $this->submit_grlist($hash, $hash);
1655  }
1656  }
1657  }
1658 
1666  public function submitFile_section($hash) {
1667  // Testing if there is already a section
1668  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
1669  $count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_section', 'phash=' . (int)$hash . ' AND page_id=' . (int)$this->conf['id']);
1670  if ($count == 0) {
1671  $this->submit_section($hash, $this->hash['phash']);
1672  }
1673  }
1674  }
1675 
1683  public function removeOldIndexedFiles($phash) {
1684  // Removing old registrations for tables.
1685  $tableArray = explode(',', 'index_phash,index_grlist,index_fulltext,index_debug');
1686  foreach ($tableArray as $table) {
1687  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed($table)) {
1688  $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash=' . (int)$phash);
1689  }
1690  }
1691  }
1692 
1693  /********************************
1694  *
1695  * SQL Helper functions
1696  *
1697  *******************************/
1707  public function checkMtimeTstamp($mtime, $phash) {
1708  if (!\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
1709  // Not indexed (not in index_phash)
1710  $result = 4;
1711  } else {
1712  $row = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('item_mtime,tstamp', 'index_phash', 'phash=' . (int)$phash);
1713  // If there was an indexing of the page...:
1714  if ($row) {
1715  if ($this->tstamp_maxAge && $row['tstamp'] + $this->tstamp_maxAge < $GLOBALS['EXEC_TIME']) {
1716  // If max age is exceeded, index the page
1717  // The configured max-age was exceeded for the document and thus it's indexed.
1718  $result = 1;
1719  } else {
1720  if (!$this->tstamp_minAge || $row['tstamp'] + $this->tstamp_minAge < $GLOBALS['EXEC_TIME']) {
1721  // if minAge is not set or if minAge is exceeded, consider at mtime
1722  if ($mtime) {
1723  // It mtime is set, then it's tested. If not, the page must clearly be indexed.
1724  if ($row['item_mtime'] != $mtime) {
1725  // And if mtime is different from the index_phash mtime, it's about time to re-index.
1726  // The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.
1727  $result = 2;
1728  } else {
1729  // mtime matched the document, so no changes detected and no content updated
1730  $result = -1;
1731  if ($this->tstamp_maxAge) {
1732  $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($row['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . ' seconds to expire time).', 1);
1733  } else {
1734  $this->updateTstamp($phash);
1735  $this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
1736  }
1737  }
1738  } else {
1739  // The minimum age was exceed, but mtime was not set, so the page was indexed.
1740  $result = 3;
1741  }
1742  } else {
1743  // The minimum age was not exceeded
1744  $result = -2;
1745  }
1746  }
1747  } else {
1748  // Page has never been indexed (is not represented in the index_phash table).
1749  $result = 4;
1750  }
1751  }
1752  return $result;
1753  }
1754 
1761  public function checkContentHash() {
1762  // With this query the page will only be indexed if it's content is different from the same "phash_grouping" -page.
1763  $result = TRUE;
1764  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
1765  $row = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('phash', 'index_phash', 'phash_grouping=' . (int)$this->hash['phash_grouping'] . ' AND contentHash=' . (int)$this->content_md5h);
1766  if ($row) {
1767  $result = $row;
1768  }
1769  }
1770  return $result;
1771  }
1772 
1782  public function checkExternalDocContentHash($hashGr, $content_md5h) {
1783  $result = TRUE;
1784  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
1785  $count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('*', 'index_phash', 'phash_grouping=' . (int)$hashGr . ' AND contentHash=' . (int)$content_md5h);
1786  $result = $count == 0;
1787  }
1788  return $result;
1789  }
1790 
1798  public function is_grlist_set($phash_x) {
1799  $result = FALSE;
1800  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
1801  $count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash_x', 'index_grlist', 'phash_x=' . (int)$phash_x);
1802  $result = $count > 0;
1803  }
1804  return $result;
1805  }
1806 
1816  public function update_grlist($phash, $phash_x) {
1817  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
1818  $count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', 'phash=' . (int)$phash . ' AND hash_gr_list=' . \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->conf['gr_list']));
1819  if ($count == 0) {
1820  $this->submit_grlist($phash, $phash_x);
1821  $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
1822  }
1823  }
1824  }
1825 
1834  public function updateTstamp($phash, $mtime = 0) {
1835  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
1836  $updateFields = array(
1837  'tstamp' => $GLOBALS['EXEC_TIME']
1838  );
1839  if ($mtime) {
1840  $updateFields['item_mtime'] = (int)$mtime;
1841  }
1842  $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int)$phash, $updateFields);
1843  }
1844  }
1845 
1853  public function updateSetId($phash) {
1854  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
1855  $updateFields = array(
1856  'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
1857  );
1858  $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int)$phash, $updateFields);
1859  }
1860  }
1861 
1870  public function updateParsetime($phash, $parsetime) {
1871  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash')) {
1872  $updateFields = array(
1873  'parsetime' => (int)$parsetime
1874  );
1875  $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash=' . (int)$phash, $updateFields);
1876  }
1877  }
1878 
1885  public function updateRootline() {
1886  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_section')) {
1887  $updateFields = array();
1888  $this->getRootLineFields($updateFields);
1889  $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', 'page_id=' . (int)$this->conf['id'], $updateFields);
1890  }
1891  }
1892 
1901  public function getRootLineFields(array &$fieldArray) {
1902  $fieldArray['rl0'] = (int)$this->conf['rootline_uids'][0];
1903  $fieldArray['rl1'] = (int)$this->conf['rootline_uids'][1];
1904  $fieldArray['rl2'] = (int)$this->conf['rootline_uids'][2];
1905  if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'])) {
1906  foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] as $fieldName => $rootLineLevel) {
1907  $fieldArray[$fieldName] = (int)$this->conf['rootline_uids'][$rootLineLevel];
1908  }
1909  }
1910  }
1911 
1920  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_phash') && \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_grlist')) {
1921  $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('A.phash', 'index_phash A,index_grlist B', '
1922  A.phash=B.phash
1923  AND A.phash_grouping=' . (int)$this->hash['phash_grouping'] . '
1924  AND B.hash_gr_list<>' . \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash($this->defaultGrList) . '
1925  AND A.contentHash=' . (int)$this->content_md5h);
1926  while ($res && FALSE !== ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))) {
1927  $this->log_setTSlogMessage('The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash=\'' . $row['phash'] . '\' are now removed.', 1);
1928  $this->removeOldIndexedPages($row['phash']);
1929  }
1930  $GLOBALS['TYPO3_DB']->sql_free_result($res);
1931  }
1932  }
1933 
1940  public function includeCrawlerClass() {
1941  GeneralUtility::requireOnce(\TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('crawler') . 'class.tx_crawler_lib.php');
1942  }
1943 
1944  /********************************
1945  *
1946  * SQL; Submitting words
1947  *
1948  *******************************/
1956  public function checkWordList($wordListArray) {
1957  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_words')) {
1958  if (count($wordListArray)) {
1959  $phashArray = array();
1960  foreach ($wordListArray as $value) {
1961  $phashArray[] = (int)$value['hash'];
1962  }
1963  $cwl = implode(',', $phashArray);
1964  $count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('baseword', 'index_words', 'wid IN (' . $cwl . ')');
1965  if ($count != count($wordListArray)) {
1966  $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', 'wid IN (' . $cwl . ')');
1967  $this->log_setTSlogMessage('Inserting words: ' . (count($wordListArray) - $count), 1);
1968  while (FALSE != ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))) {
1969  unset($wordListArray[$row['baseword']]);
1970  }
1971  $GLOBALS['TYPO3_DB']->sql_free_result($res);
1972  foreach ($wordListArray as $key => $val) {
1973  $insertFields = array(
1974  'wid' => $val['hash'],
1975  'baseword' => $key,
1976  'metaphone' => $val['metaphone']
1977  );
1978  // A duplicate-key error will occur here if a word is NOT unset in the unset() line. However as long as the words in $wl are NOT longer as 60 chars (the baseword varchar is 60 characters...) this is not a problem.
1979  $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', $insertFields);
1980  }
1981  }
1982  }
1983  }
1984  }
1985 
1994  public function submitWords($wordList, $phash) {
1995  if (\TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::isTableUsed('index_rel')) {
1996  $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash=' . (int)$phash);
1997  foreach ($wordList as $val) {
1998  $insertFields = array(
1999  'phash' => (int)$phash,
2000  'wid' => (int)$val['hash'],
2001  'count' => (int)$val['count'],
2002  'first' => (int)$val['first'],
2003  'freq' => $this->freqMap($val['count'] / $this->wordcount),
2004  'flags' => $val['cmp'] & $this->flagBitMask
2005  );
2006  $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', $insertFields);
2007  }
2008  }
2009  }
2010 
2019  public function freqMap($freq) {
2020  $mapFactor = $this->freqMax * 100 * $this->freqRange;
2021  if ($freq <= 1) {
2022  $newFreq = $freq * $mapFactor;
2023  $newFreq = $newFreq > $this->freqRange ? $this->freqRange : $newFreq;
2024  } else {
2025  $newFreq = $freq / $mapFactor;
2026  }
2027  return $newFreq;
2028  }
2029 
2030  /********************************
2031  *
2032  * Hashing
2033  *
2034  *******************************/
2041  public function setT3Hashes() {
2042  // Set main array:
2043  $hArray = array(
2044  'id' => (int)$this->conf['id'],
2045  'type' => (int)$this->conf['type'],
2046  'sys_lang' => (int)$this->conf['sys_language_uid'],
2047  'MP' => (string) $this->conf['MP'],
2048  'cHash' => $this->cHashParams
2049  );
2050  // Set grouping hash (Identifies a "page" combined of id, type, language, mountpoint and cHash parameters):
2051  $this->hash['phash_grouping'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($hArray));
2052  // Add gr_list and set plain phash (Subdivision where special page composition based on login is taken into account as well. It is expected that such pages are normally similar regardless of the login.)
2053  $hArray['gr_list'] = (string) $this->conf['gr_list'];
2054  $this->hash['phash'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($hArray));
2055  }
2056 
2065  public function setExtHashes($file, $subinfo = array()) {
2066  // Set main array:
2067  $hash = array();
2068  $hArray = array(
2069  'file' => $file
2070  );
2071  // Set grouping hash:
2072  $hash['phash_grouping'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($hArray));
2073  // Add subinfo
2074  $hArray['subinfo'] = $subinfo;
2075  $hash['phash'] = \TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility::md5inthash(serialize($hArray));
2076  return $hash;
2077  }
2078 
2079  /*********************************
2080  *
2081  * Internal logging functions
2082  *
2083  *********************************/
2092  public function log_push($msg, $key) {
2093  if (is_object($GLOBALS['TT'])) {
2094  $GLOBALS['TT']->push($msg, $key);
2095  }
2096  }
2097 
2104  public function log_pull() {
2105  if (is_object($GLOBALS['TT'])) {
2106  $GLOBALS['TT']->pull();
2107  }
2108  }
2109 
2118  public function log_setTSlogMessage($msg, $errorNum = 0) {
2119  if (is_object($GLOBALS['TT'])) {
2120  $GLOBALS['TT']->setTSlogMessage($msg, $errorNum);
2121  }
2122  $this->internal_log[] = $msg;
2123  }
2124 
2125  /**************************
2126  *
2127  * \TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController hooks:
2128  *
2129  **************************/
2138  protected function addSpacesToKeywordList($keywordList) {
2139  $keywords = GeneralUtility::trimExplode(',', $keywordList);
2140  return ' ' . implode(', ', $keywords) . ' ';
2141  }
2142 
2143 }
backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE)
Definition: Indexer.php:331
analyzeHeaderinfo(&$retArr, $content, $key, $offset)
Definition: Indexer.php:1336
submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
Definition: Indexer.php:1580
submit_grlist($hash, $phash_x)
Definition: Indexer.php:1505
$TYPO3_CONF_VARS['SYS']['contentTable']
backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0)
Definition: Indexer.php:400
static writeFile($file, $content, $changePermissions=FALSE)
metaphone($word, $returnRawMetaphoneValue=FALSE)
Definition: Indexer.php:1398
static isAllowedLocalFile($filePath)
Definition: Indexer.php:1076
checkExternalDocContentHash($hashGr, $content_md5h)
Definition: Indexer.php:1782
convertHTMLToUtf8($content, $charset='')
Definition: Indexer.php:662
static forceIntegerInRange($theInt, $min, $max=2000000000, $defaultValue=0)
Definition: MathUtility.php:32
update_grlist($phash, $phash_x)
Definition: Indexer.php:1816
embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
Definition: Indexer.php:688
indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='')
Definition: Indexer.php:1098
static getUserObj($classRef, $checkPrefix='', $silent=FALSE)
createLocalPathFromRelativeURL($sourcePath)
Definition: Indexer.php:1048
createLocalPathFromT3vars($sourcePath)
Definition: Indexer.php:965
updateTstamp($phash, $mtime=0)
Definition: Indexer.php:1834
getRootLineFields(array &$fieldArray)
Definition: Indexer.php:1901
static trimExplode($delim, $string, $removeEmptyValues=FALSE, $limit=0)
addSpacesToKeywordList($keywordList)
Definition: Indexer.php:2138
submitWords($wordList, $phash)
Definition: Indexer.php:1994
fileContentParts($ext, $absFile)
Definition: Indexer.php:1222
setExtHashes($file, $subinfo=array())
Definition: Indexer.php:2065
updateParsetime($phash, $parsetime)
Definition: Indexer.php:1870
readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
Definition: Indexer.php:1205
if($list_of_literals) if(!empty($literals)) if(!empty($literals)) $result
Analyse literals to prepend the N char to them if their contents aren't numeric.
static getUrl($url, $includeHeader=0, $requestHeaders=FALSE, &$report=NULL)
static tempnam($filePrefix, $fileSuffix='')
analyzeBody(&$retArr, $content)
Definition: Indexer.php:1367
createLocalPathFromAbsoluteURL($sourcePath)
Definition: Indexer.php:1030
static implodeArrayForUrl($name, array $theArray, $str='', $skipBlank=FALSE, $rawurlencodeParamName=FALSE)
createLocalPathUsingDomainURL($sourcePath)
Definition: Indexer.php:986
charsetEntity2utf8(&$contentArr, $charset)
Definition: Indexer.php:1258
if(!defined('TYPO3_MODE')) $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_userauth.php']['logoff_pre_processing'][]
backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0)
Definition: Indexer.php:381
static getFileAbsFileName($filename, $onlyRelative=TRUE, $relToTYPO3_mainDir=FALSE)
checkMtimeTstamp($mtime, $phash)
Definition: Indexer.php:1707
submit_section($hash, $hash_t3)
Definition: Indexer.php:1527
log_setTSlogMessage($msg, $errorNum=0)
Definition: Indexer.php:2118
indexExternalUrl($externalUrl)
Definition: Indexer.php:884
checkWordList($wordListArray)
Definition: Indexer.php:1956
createLocalPathUsingAbsRefPrefix($sourcePath)
Definition: Indexer.php:1007