‪TYPO3CMS  10.4
CrawlerHook.php
Go to the documentation of this file.
1 <?php
2 
3 /*
4  * This file is part of the TYPO3 CMS project.
5  *
6  * It is free software; you can redistribute it and/or modify it under
7  * the terms of the GNU General Public License, either version 2
8  * of the License, or any later version.
9  *
10  * For the full copyright and license information, please read the
11  * LICENSE.txt file that was distributed with this source code.
12  *
13  * The TYPO3 project - inspiring people to share!
14  */
15 
17 
31 
37 {
/**
 * Number of seconds by which the scheduled execution time of each
 * additional queue entry is shifted (multiplied with $instanceCounter).
 *
 * @var int
 */
public $secondsPerExternalUrl = 3;

/**
 * Counter of follow-up queue entries created by this instance; used
 * together with $secondsPerExternalUrl to spread scheduled times.
 *
 * @var int
 */
public $instanceCounter = 0;

/**
 * Callback class name handed to the crawler when queue entries are added.
 *
 * @var string
 */
public $callBack = self::class;

/**
 * Parent crawler object; kept so addQueueEntryForHook() can add entries.
 *
 * @var object
 */
private $pObj;
61 
/**
 * Initialization of the crawler hook.
 *
 * Selects all indexing configurations whose next indexing time has passed
 * and which are not running (set_id = 0), marks each of them as running by
 * assigning a fresh set id and the next indexing time, and adds the initial
 * entry to the crawler queue. Afterwards configurations whose queue has
 * drained are reset.
 *
 * @param object $pObj Parent crawler object; addQueueEntry_callBack() is called on it
 */
public function crawler_init(&$pObj)
{
    $this->pObj = $pObj;

    // Handed to initMessage() of custom hook objects.
    $message = null;

    // Select all indexing configurations which are waiting to be activated:
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_config');
    $queryBuilder = $connection->createQueryBuilder();
    $result = $queryBuilder->select('*')
        ->from('index_config')
        ->where(
            $queryBuilder->expr()->lt(
                'timer_next_indexing',
                $queryBuilder->createNamedParameter($GLOBALS['EXEC_TIME'], \PDO::PARAM_INT)
            ),
            $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT))
        )
        ->execute();

    // For each configuration, start the process and queue its first entry:
    while ($cfgRec = $result->fetch()) {
        // Generate a unique set-ID:
        $setId = GeneralUtility::md5int(microtime());
        // Get next time:
        $nextTime = $this->generateNextIndexingTime($cfgRec);
        // Start process by updating index-config record:
        $connection->update(
            'index_config',
            [
                'set_id' => $setId,
                'timer_next_indexing' => $nextTime,
                'session_data' => ''
            ],
            [
                'uid' => (int)$cfgRec['uid']
            ]
        );

        $procInstructions = ['[Index Cfg UID#' . $cfgRec['uid'] . ']'];
        // Stays null when the configuration type needs no queue entry.
        $params = null;
        switch ($cfgRec['type']) {
            case 1:
                // Records
                $params = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'procInstructions' => $procInstructions,
                    'url' => 'Records (start)'
                ];
                break;
            case 2:
                // Files
                $params = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'procInstructions' => $procInstructions,
                    'url' => $cfgRec['filepath'],
                    'depth' => 0
                ];
                break;
            case 3:
                // External URL
                $params = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'procInstructions' => $procInstructions,
                    'url' => $cfgRec['externalUrl'],
                    'depth' => 0
                ];
                break;
            case 4:
                // Page tree (the root page uid is transported in "url")
                $params = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'procInstructions' => $procInstructions,
                    'url' => (int)$cfgRec['alternative_source_pid'],
                    'depth' => 0
                ];
                break;
            case 5:
                // Meta configuration, nothing to do.
                break;
            default:
                // Custom type: only handled when a hook class is registered for it.
                $hookClass = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']] ?? null;
                if ($hookClass) {
                    $hookObj = GeneralUtility::makeInstance($hookClass);
                    $params = [
                        'indexConfigUid' => $cfgRec['uid'],
                        'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . '/CUSTOM]'],
                        'url' => $hookObj->initMessage($message)
                    ];
                }
        }
        if ($params !== null) {
            $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
        }
    }

    // Finally, look up all running index configurations which are finished and need to be reset:
    $this->cleanUpOldRunningConfigurations();
}
184 
/**
 * Crawler callback: processes one queue entry belonging to an indexing
 * configuration.
 *
 * Loads the index_config record named by $params['indexConfigUid'],
 * dispatches to the handler for its type (or to a registered custom hook)
 * and writes the possibly modified session data back to the record.
 *
 * @param array $params Queue entry parameters; 'indexConfigUid' must be set
 * @param object $pObj Parent crawler object
 * @return array Array with key "log" holding the received parameters
 */
public function crawler_execute($params, &$pObj)
{
    $logResult = ['log' => $params];

    // Indexer configuration ID must exist:
    if (empty($params['indexConfigUid'])) {
        return $logResult;
    }

    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    $queryBuilder = $connectionPool->getQueryBuilderForTable('index_config');
    $queryBuilder->getRestrictions()->removeAll();
    // Load the indexing configuration record:
    $cfgRec = $queryBuilder
        ->select('*')
        ->from('index_config')
        ->where(
            $queryBuilder->expr()->eq(
                'uid',
                $queryBuilder->createNamedParameter($params['indexConfigUid'], \PDO::PARAM_INT)
            )
        )
        ->execute()
        ->fetch();
    if (!is_array($cfgRec)) {
        return $logResult;
    }

    // Unpack session data. Only arrays/scalars are stored by the handlers in
    // this class, so object instantiation is disabled while unserializing.
    $session_data = unserialize((string)$cfgRec['session_data'], ['allowed_classes' => false]);

    // Select which type:
    switch ($cfgRec['type']) {
        case 1:
            // Records:
            $this->crawler_execute_type1($cfgRec, $session_data, $params, $pObj);
            break;
        case 2:
            // Files:
            $this->crawler_execute_type2($cfgRec, $session_data, $params, $pObj);
            break;
        case 3:
            // External URL:
            $this->crawler_execute_type3($cfgRec, $session_data, $params, $pObj);
            break;
        case 4:
            // Page tree:
            $this->crawler_execute_type4($cfgRec, $session_data, $params, $pObj);
            break;
        case 5:
            // Meta configuration: never queued, nothing to do.
            break;
        default:
            $hookClass = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']] ?? null;
            if ($hookClass) {
                $hookObj = GeneralUtility::makeInstance($hookClass);
                // Needed by addQueueEntryForHook(), which the hook may call back:
                $this->pObj = $pObj;
                // Separate variable so static analysis keeps the type of $this.
                $ref = $this;
                $hookObj->indexOperation($cfgRec, $session_data, $params, $ref);
            }
    }

    // Save process data which might have been modified:
    $connectionPool
        ->getConnectionForTable('index_config')
        ->update(
            'index_config',
            ['session_data' => serialize($session_data)],
            ['uid' => (int)$cfgRec['uid']]
        );

    return $logResult;
}
257 
/**
 * Indexes one batch of records from the configured table and queues a
 * follow-up entry while records are left.
 *
 * The uid of the last indexed record is kept in $session_data['uid'] so the
 * next batch continues after it.
 *
 * @param array $cfgRec Indexing configuration record
 * @param array $session_data Session data for the indexing run, passed by reference
 * @param array $params Parameters from the queue entry
 * @param object $pObj Parent crawler object
 */
public function crawler_execute_type1($cfgRec, &$session_data, $params, &$pObj)
{
    $table = $cfgRec['table2index'];
    if (!$table || !isset($GLOBALS['TCA'][$table])) {
        return;
    }

    // Init session data array if not already:
    if (!is_array($session_data)) {
        $session_data = [
            'uid' => 0
        ];
    }

    $pid = (int)$cfgRec['alternative_source_pid'] ?: $cfgRec['pid'];
    $numberOfRecords = $cfgRec['recordsbatch']
        ? MathUtility::forceIntegerInRange($cfgRec['recordsbatch'], 1)
        : 100;

    // Get root line:
    $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Select the next batch, ordered by uid so the position can be resumed:
    $queryBuilder = $connectionPool->getQueryBuilderForTable($table);
    $result = $queryBuilder->select('*')
        ->from($table)
        ->where(
            $queryBuilder->expr()->eq(
                'pid',
                $queryBuilder->createNamedParameter($pid, \PDO::PARAM_INT)
            ),
            $queryBuilder->expr()->gt(
                'uid',
                $queryBuilder->createNamedParameter($session_data['uid'], \PDO::PARAM_INT)
            )
        )
        ->orderBy('uid')
        ->setMaxResults($numberOfRecords)
        ->execute();

    while ($row = $result->fetch()) {
        // Index single record:
        $this->indexSingleRecord($row, $cfgRec, $rootLine);
        // Update the UID we last processed:
        $session_data['uid'] = $row['uid'];
    }

    // Count what is left AFTER this batch. A separate query builder is used
    // so the count neither inherits ORDER BY / LIMIT nor the uid parameter
    // that was bound before the batch was processed.
    $countQueryBuilder = $connectionPool->getQueryBuilderForTable($table);
    $remainingRecords = (int)$countQueryBuilder->count('uid')
        ->from($table)
        ->where(
            $countQueryBuilder->expr()->eq(
                'pid',
                $countQueryBuilder->createNamedParameter($pid, \PDO::PARAM_INT)
            ),
            $countQueryBuilder->expr()->gt(
                'uid',
                $countQueryBuilder->createNamedParameter($session_data['uid'], \PDO::PARAM_INT)
            )
        )
        ->execute()
        ->fetchColumn(0);

    // Finally, set entry for next indexing of batch of records:
    if ($remainingRecords > 0) {
        $nparams = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => 'Records from UID#' . ($session_data['uid'] + 1) . '-?',
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']']
        ];
        $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
    }
}
324 
/**
 * Indexes a file, or queues the files and sub-directories of a directory.
 *
 * $params['url'] holds the path. A file is indexed right away; for a
 * directory one queue entry per contained file is added, plus one per
 * sub-directory as long as the configured depth is not reached.
 *
 * @param array $cfgRec Indexing configuration record
 * @param array $session_data Session data for the indexing run, passed by reference (not used here)
 * @param array $params Parameters from the queue entry ('url', 'depth')
 * @param object $pObj Parent crawler object
 */
public function crawler_execute_type2($cfgRec, &$session_data, $params, &$pObj)
{
    // Prepare path, making it absolute and checking:
    $readpath = $params['url'];
    if (!GeneralUtility::isAbsPath($readpath)) {
        $readpath = GeneralUtility::getFileAbsFileName($readpath);
    }
    if (!GeneralUtility::isAllowedAbsPath($readpath)) {
        return;
    }

    if (@is_file($readpath)) {
        // Root line uids have to be provided when indexing external files.
        $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
        // (Re)-Indexing file on page. The empty array fills the
        // $queryArguments slot so that uid and set_id end up in
        // $freeIndexUid / $freeIndexSetId as intended.
        $indexerObj = $this->initializeIndexer($cfgRec['pid'], 0, 0, '', $rl, [], $cfgRec['uid'], $cfgRec['set_id']);
        // Avoids phash_t3 being written to file sections (otherwise they
        // are removed when the page is reindexed).
        $indexerObj->hash['phash'] = -1;
        // Index document:
        $indexerObj->indexRegularDocument(PathUtility::stripPathSitePrefix($readpath), true);
        return;
    }

    if (!@is_dir($readpath)) {
        return;
    }

    // Directory: collect files (filtered by extension) and sub-directories.
    $extList = implode(',', GeneralUtility::trimExplode(',', $cfgRec['extensions'], true));
    $fileArr = [];
    $files = GeneralUtility::getAllFilesAndFoldersInPath($fileArr, $readpath, $extList, false, 0);
    $directoryList = GeneralUtility::get_dirs($readpath);
    if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) {
        foreach ($directoryList as $subdir) {
            if ((string)$subdir != '') {
                $files[] = $readpath . $subdir . '/';
            }
        }
    }
    $files = GeneralUtility::removePrefixPathFromList($files, Environment::getPublicPath() . '/');

    // Traverse the items and create queue entries, spreading their scheduled time:
    foreach ($files as $path) {
        $this->instanceCounter++;
        if ($path !== $params['url']) {
            $nparams = [
                'indexConfigUid' => $cfgRec['uid'],
                'url' => $path,
                'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                'depth' => $params['depth'] + 1
            ];
            $pObj->addQueueEntry_callBack(
                $cfgRec['set_id'],
                $nparams,
                $this->callBack,
                $cfgRec['pid'],
                $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
            );
        }
    }
}
383 
/**
 * Indexes an external URL and queues the links found on it.
 *
 * Links are only followed while the configured depth is not reached, and
 * only when checkUrl() accepts them and checkDeniedSuburls() does not
 * reject them. Accepted URLs are recorded in $session_data['urlLog'].
 *
 * @param array $cfgRec Indexing configuration record
 * @param array $session_data Session data for the indexing run, passed by reference
 * @param array $params Parameters from the queue entry ('url', 'depth')
 * @param object $pObj Parent crawler object
 */
public function crawler_execute_type3($cfgRec, &$session_data, $params, &$pObj)
{
    // First call of a run: start the URL log with the entry URL.
    if (!is_array($session_data)) {
        $session_data = ['urlLog' => [$params['url']]];
    }

    // Index the URL itself and collect the links it contains:
    $rootLineUids = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $foundUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rootLineUids, $cfgRec['uid'], $cfgRec['set_id']);

    // Stop here once the maximum depth is reached.
    if (!($params['depth'] < $cfgRec['depth'])) {
        return;
    }

    foreach ($foundUrls as $candidate) {
        $url = $this->checkUrl($candidate, $session_data['urlLog'], $cfgRec['externalUrl']);
        if (!$url) {
            continue;
        }
        if ($this->checkDeniedSuburls($url, $cfgRec['url_deny'])) {
            continue;
        }

        $this->instanceCounter++;
        $session_data['urlLog'][] = $url;

        $scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
        $pObj->addQueueEntry_callBack(
            $cfgRec['set_id'],
            [
                'indexConfigUid' => $cfgRec['uid'],
                'url' => $url,
                'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                'depth' => $params['depth'] + 1
            ],
            $this->callBack,
            $cfgRec['pid'],
            $scheduledTime
        );
    }
}
423 
/**
 * Submits the URLs of one page for re-indexing and queues its subpages.
 *
 * The page uid is transported in $params['url']. Subpages are queued as
 * long as the configured depth is not reached.
 *
 * @param array $cfgRec Indexing configuration record
 * @param array $session_data Session data for the indexing run, passed by reference
 * @param array $params Parameters from the queue entry ('url', 'depth')
 * @param object $pObj Parent crawler object
 */
public function crawler_execute_type4($cfgRec, &$session_data, $params, &$pObj)
{
    $pageUid = (int)$params['url'];

    // Let the crawler compile the URLs of this page and submit them:
    $pageRow = BackendUtility::getRecord('pages', $pageUid);
    $urlSets = $pObj->getUrlsForPageRow($pageRow);
    // Both arrays are only needed as arguments for urlListFromUrlArray().
    $duplicateTrack = [];
    $downloadUrls = [];
    if (!empty($urlSets)) {
        foreach ($urlSets as $urlSet) {
            $pObj->urlListFromUrlArray(
                $urlSet,
                $pageRow,
                $GLOBALS['EXEC_TIME'],
                30,
                1,
                0,
                $duplicateTrack,
                $downloadUrls,
                ['tx_indexedsearch_reindex']
            );
        }
    }

    // No subpages are queued once the maximum depth is reached.
    if (!($params['depth'] < $cfgRec['depth'])) {
        return;
    }

    // Select the direct subpages, ignoring all restrictions except "deleted":
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
    $queryBuilder->getRestrictions()
        ->removeAll()
        ->add(GeneralUtility::makeInstance(DeletedRestriction::class));
    $parentConstraint = $queryBuilder->expr()->eq(
        'pid',
        $queryBuilder->createNamedParameter($pageUid, \PDO::PARAM_INT)
    );
    $subpages = $queryBuilder->select('uid', 'title')
        ->from('pages')
        ->where($parentConstraint)
        ->execute();

    // Traverse subpages and add each to the queue:
    while ($subpage = $subpages->fetch()) {
        $this->instanceCounter++;
        $session_data['urlLog'][] = 'pages:' . $subpage['uid'] . ': ' . $subpage['title'];

        $queueParams = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $subpage['uid'],
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
            'depth' => $params['depth'] + 1
        ];
        $scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
        $pObj->addQueueEntry_callBack($cfgRec['set_id'], $queueParams, $this->callBack, $cfgRec['pid'], $scheduledTime);
    }
}
487 
/**
 * Resets running indexing configurations whose crawler queue has drained.
 *
 * For every configuration with a set_id, the pending queue entries
 * (exec_time = 0) of that set are counted. If none are left, index rows
 * that belong to the configuration but stem from an earlier set are removed
 * and the configuration is marked as not running again.
 */
public function cleanUpOldRunningConfigurations()
{
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    // List of tables that store information related to the phash value
    $tablesToClean = [
        'index_phash',
        'index_rel',
        'index_section',
        'index_grlist',
        'index_fulltext',
        'index_debug'
    ];

    $queryBuilder = $connectionPool->getQueryBuilderForTable('index_config');
    $queryBuilder->getRestrictions()
        ->removeAll()
        ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

    // Lookup running index configurations:
    $runningIndexingConfigurations = $queryBuilder->select('*')
        ->from('index_config')
        ->where($queryBuilder->expr()->neq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT)))
        ->execute()
        ->fetchAll();

    foreach ($runningIndexingConfigurations as $cfgRec) {
        // Entries still waiting for execution mean the run is not finished.
        $queuedItems = $connectionPool->getConnectionForTable('tx_crawler_queue')
            ->count(
                '*',
                'tx_crawler_queue',
                [
                    'set_id' => (int)$cfgRec['set_id'],
                    'exec_time' => 0
                ]
            );
        if ($queuedItems) {
            continue;
        }

        // Lookup phash rows of this configuration created by an older set:
        $queryBuilder = $connectionPool->getQueryBuilderForTable('index_phash');
        $oldPhashRows = $queryBuilder
            ->select('phash')
            ->from('index_phash')
            ->where(
                $queryBuilder->expr()->eq(
                    'freeIndexUid',
                    $queryBuilder->createNamedParameter($cfgRec['uid'], \PDO::PARAM_INT)
                ),
                $queryBuilder->expr()->neq(
                    'freeIndexSetId',
                    $queryBuilder->createNamedParameter($cfgRec['set_id'], \PDO::PARAM_INT)
                )
            )
            ->execute()
            ->fetchAll();

        // Removing old registrations from all tables. Skipped entirely when
        // nothing is stale, as deleteFromIndex() does, instead of sending
        // DELETE statements with an empty value list.
        $oldPhashes = array_column($oldPhashRows, 'phash');
        if (!empty($oldPhashes)) {
            foreach ($tablesToClean as $table) {
                $queryBuilder = $connectionPool->getQueryBuilderForTable($table);
                $queryBuilder->delete($table)
                    ->where(
                        $queryBuilder->expr()->in(
                            'phash',
                            $queryBuilder->createNamedParameter($oldPhashes, Connection::PARAM_INT_ARRAY)
                        )
                    )
                    ->execute();
            }
        }

        // End process by updating index-config record:
        $connectionPool->getConnectionForTable('index_config')
            ->update(
                'index_config',
                [
                    'set_id' => 0,
                    'session_data' => ''
                ],
                ['uid' => (int)$cfgRec['uid']]
            );
    }
}
576 
577  /*****************************************
578  *
579  * Helper functions
580  *
581  *****************************************/
/**
 * Checks whether a URL qualifies for being queued.
 *
 * A trailing double slash is reduced to a single one and any fragment is
 * cut off. The URL is accepted when it contains no "../", starts with
 * $baseUrl and is not yet listed in $urlLog.
 *
 * @param string $url URL to check
 * @param array $urlLog URLs which were already accepted
 * @param string $baseUrl Prefix the URL has to start with
 * @return string The normalized URL if accepted, otherwise an empty string
 */
public function checkUrl($url, $urlLog, $baseUrl)
{
    $url = (string)preg_replace('/\\/\\/$/', '/', $url);
    [$url] = explode('#', $url);

    if (strpos($url, '../') !== false) {
        return '';
    }
    if (!GeneralUtility::isFirstPartOfStr($url, $baseUrl)) {
        return '';
    }
    // Strict comparison keeps numeric-looking strings apart; the cast
    // tolerates a log which has not been initialised yet.
    if (in_array($url, (array)$urlLog, true)) {
        return '';
    }

    return $url;
}
604 
/**
 * Indexes an external URL and returns the absolute URLs of all hyperlinks
 * found in its content.
 *
 * Links without a scheme are resolved against the host of $url (when they
 * start with "/") or against the base href of the document. If the document
 * declares no base href, the directory part of $url is used instead.
 *
 * @param string $url URL to index
 * @param int $pageId Page id the indexed content is related to
 * @param array $rl Root line uids
 * @param int $cfgUid Uid of the indexing configuration
 * @param int $setId Set id of the current indexing run
 * @return array List of URLs found in the indexed document
 */
public function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
{
    // Index external URL:
    $indexerObj = $this->initializeIndexer($pageId, 0, 0, '', $rl, [], $cfgUid, $setId);
    // Avoids phash_t3 being written to file sections (otherwise they are
    // removed when the page is reindexed).
    $indexerObj->hash['phash'] = -1;
    $indexerObj->indexExternalUrl($url);

    // parse_url() returns false for malformed URLs and omits components
    // which are not present, so every part is read with a fallback.
    $urlParts = parse_url($url);
    if (!is_array($urlParts)) {
        $urlParts = [];
    }
    $urlPath = (string)($urlParts['path'] ?? '');
    $baseAbsoluteHref = ($urlParts['scheme'] ?? '') . '://' . ($urlParts['host'] ?? '');

    $baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content);
    if (!$baseHref) {
        // Extract base href from the current URL (everything up to the last slash of the path)
        $baseHref = $baseAbsoluteHref . (string)substr($urlPath, 0, (int)strrpos($urlPath, '/'));
    }
    $baseHref = rtrim($baseHref, '/');

    // Get URLs on this page:
    $subUrls = [];
    $list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
    foreach ($list as $linkInfo) {
        // Decode entities:
        $subUrl = htmlspecialchars_decode($linkInfo['href']);
        $linkParts = parse_url($subUrl);
        $hasScheme = is_array($linkParts) && !empty($linkParts['scheme']);
        if (!$hasScheme) {
            $relativeUrl = (string)GeneralUtility::resolveBackPath($subUrl);
            // strpos() instead of $relativeUrl[0] so an empty href does not
            // trigger an "uninitialized string offset" error.
            if (strpos($relativeUrl, '/') === 0) {
                $subUrl = $baseAbsoluteHref . $relativeUrl;
            } else {
                $subUrl = $baseHref . '/' . $relativeUrl;
            }
        }
        $subUrls[] = $subUrl;
    }
    return $subUrls;
}
651 
/**
 * Indexes a single record as if it was a TYPO3 page.
 *
 * The first field of the configured field list becomes the title, all
 * further fields are concatenated to the content. "###UID###" in the
 * configured GET parameters is replaced by the record uid.
 *
 * @param array $r Record to index
 * @param array $cfgRec Indexing configuration record
 * @param array|null $rl Root line uids; looked up via the configuration's pid if not an array
 */
public function indexSingleRecord($r, $cfgRec, $rl = null)
{
    $rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $fieldList = GeneralUtility::trimExplode(',', $cfgRec['fieldlist'], true);

    // Not every table declares languageField / tstamp / crdate in its TCA
    // ctrl section, so each of them is read with a fallback.
    $ctrl = $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl'] ?? [];
    $languageField = $ctrl['languageField'] ?? '';
    $sys_language_uid = $languageField ? ($r[$languageField] ?? 0) : 0;
    $mtime = isset($ctrl['tstamp']) ? ($r[$ctrl['tstamp']] ?? 0) : 0;
    $crdate = isset($ctrl['crdate']) ? ($r[$ctrl['crdate']] ?? 0) : 0;

    parse_str(str_replace('###UID###', $r['uid'], $cfgRec['get_params']), $GETparams);

    // (Re)-Indexing a row from a table
    $indexerObj = $this->initializeIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, $cfgRec['uid'], $cfgRec['set_id']);
    $indexerObj->forceIndexing = true;

    $theTitle = '';
    $theContent = '';
    foreach ($fieldList as $position => $fieldName) {
        $fieldValue = $r[$fieldName] ?? '';
        if (!$position) {
            $theTitle = $fieldValue;
        } else {
            $theContent .= $fieldValue . ' ';
        }
    }

    // Indexing the record as a page (but with parameters set). A space is put
    // in front of every tag so words do not run together once tags are stripped.
    $this->indexAsTYPO3Page(
        $indexerObj,
        strip_tags(str_replace('<', ' <', $theTitle)),
        strip_tags(str_replace('<', ' <', $theContent)),
        $mtime,
        $crdate,
        $r['uid']
    );
}
688 
/**
 * Returns the page uids of the root line for a page, as determined by the
 * template service after running through the templates of that root line.
 *
 * @param int $id Page uid
 * @return array Root line uids keyed like the template service root line; empty if the root line cannot be built
 */
public function getUidRootLineForClosestTemplate($id)
{
    try {
        $pageRootLine = GeneralUtility::makeInstance(RootlineUtility::class, $id)->get();
    } catch (RootLineException $e) {
        // No root line available for this page: report an empty list.
        return [];
    }

    try {
        // This generates the constants/config + hierarchy info for the template.
        $templateService = GeneralUtility::makeInstance(ExtendedTemplateService::class);
        $templateService->runThroughTemplates($pageRootLine);
    } catch (RootLineException $e) {
        return [];
    }

    $uids = [];
    foreach ($templateService->rootLine as $level => $page) {
        $uids[$level] = $page['uid'];
    }
    return $uids;
}
714 
/**
 * Calculates the next time an indexing configuration is due.
 *
 * The result is the first point in time at or after the current execution
 * time which lies a whole number of frequency intervals after the reference
 * midnight plus the configured offset.
 *
 * @param array $cfgRec Indexing configuration record ('timer_frequency', 'timer_offset', 'timer_next_indexing')
 * @return int Unix timestamp of the next indexing run
 */
public function generateNextIndexingTime($cfgRec)
{
    $currentTime = $GLOBALS['EXEC_TIME'];

    // Find a midnight to use as reference for the offset calculation. For
    // frequencies of up to one day any day will do, so yesterday's midnight
    // is used. For longer frequencies the day of the previously scheduled
    // run is kept, so e.g. the weekday of a weekly run is respected
    // regardless of when the script actually runs.
    if ($cfgRec['timer_frequency'] <= 24 * 3600) {
        $aMidNight = mktime(0, 0, 0) - 1 * 24 * 3600;
    } else {
        $lastTime = $cfgRec['timer_next_indexing'] ?: $GLOBALS['EXEC_TIME'];
        // Four-digit year: a two-digit year is re-interpreted by mktime().
        $aMidNight = mktime(0, 0, 0, (int)date('m', $lastTime), (int)date('d', $lastTime), (int)date('Y', $lastTime));
    }

    // Find last offset time plus frequency in seconds:
    $lastSureOffset = $aMidNight + MathUtility::forceIntegerInRange($cfgRec['timer_offset'], 0, 86400);
    $frequencySeconds = MathUtility::forceIntegerInRange($cfgRec['timer_frequency'], 1);

    // Number of whole frequency blocks needed to reach the current time:
    $frequencyBlocksUntilNextTime = (int)ceil(($currentTime - $lastSureOffset) / $frequencySeconds);

    // Next time is the offset plus the blocks multiplied with the frequency
    // length; the block count is cast because ceil() returns a float.
    return $lastSureOffset + $frequencyBlocksUntilNextTime * $frequencySeconds;
}
739 
/**
 * Tells whether a URL starts with one of the denied URL prefixes.
 *
 * @param string $url URL to test
 * @param string $url_deny Line-feed separated list of denied URL prefixes
 * @return bool TRUE if the URL matches a denied prefix
 */
public function checkDeniedSuburls($url, $url_deny)
{
    // Nothing configured, nothing denied.
    if (!trim($url_deny)) {
        return false;
    }

    $deniedPrefixes = GeneralUtility::trimExplode(LF, $url_deny, true);
    foreach ($deniedPrefixes as $deniedPrefix) {
        if (GeneralUtility::isFirstPartOfStr($url, $deniedPrefix)) {
            return true;
        }
    }

    return false;
}
759 
/**
 * Adds a crawler queue entry on behalf of a custom indexing hook.
 *
 * Uses the parent crawler object stored by crawler_init() or
 * crawler_execute().
 *
 * @param array $cfgRec Indexing configuration record
 * @param string $title Title/URL label of the queue entry
 */
public function addQueueEntryForHook($cfgRec, $title)
{
    // 'indexConfigUid' must ALWAYS be the uid of the configuration record.
    $configUid = $cfgRec['uid'];
    $this->pObj->addQueueEntry_callBack(
        $cfgRec['set_id'],
        [
            'indexConfigUid' => $configUid,
            'url' => $title,
            'procInstructions' => ['[Index Cfg UID#' . $configUid . ']']
        ],
        $this->callBack,
        $cfgRec['pid']
    );
}
776 
/**
 * Removes all index rows which belong to a page.
 *
 * The phash values registered for the page in index_section are looked up
 * and every row carrying one of them is deleted from the index tables.
 *
 * @param int $id Uid of the page to remove from the index
 */
public function deleteFromIndex($id)
{
    $pool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Lookup the phash rows registered for this page:
    $sectionQuery = $pool->getQueryBuilderForTable('index_section');
    $pageConstraint = $sectionQuery->expr()->eq(
        'page_id',
        $sectionQuery->createNamedParameter($id, \PDO::PARAM_INT)
    );
    $phashRows = $sectionQuery->select('phash')
        ->from('index_section')
        ->where($pageConstraint)
        ->execute()
        ->fetchAll();

    // Nothing indexed for this page.
    if (empty($phashRows)) {
        return;
    }

    $phashList = array_column($phashRows, 'phash');
    $indexTables = [
        'index_debug',
        'index_fulltext',
        'index_grlist',
        'index_phash',
        'index_rel',
        'index_section',
    ];
    foreach ($indexTables as $indexTable) {
        $deleteQuery = $pool->getQueryBuilderForTable($indexTable);
        $phashConstraint = $deleteQuery->expr()->in(
            'phash',
            $deleteQuery->createNamedParameter($phashList, Connection::PARAM_INT_ARRAY)
        );
        $deleteQuery->delete($indexTable)
            ->where($phashConstraint)
            ->execute();
    }
}
827 
828  /*************************
829  *
830  * Hook functions for DataHandler (indexing of records)
831  *
832  *************************/
/**
 * DataHandler hook: removes a page from the index before it is deleted.
 *
 * @param string $command DataHandler command
 * @param string $table Table name
 * @param string $id Record uid
 * @param mixed $value Command value (not used)
 * @param object $pObj DataHandler parent object (not used)
 */
public function processCmdmap_preProcess($command, $table, $id, $value, $pObj)
{
    // Only deletion of pages is of interest here.
    if ($command !== 'delete' || $table !== 'pages') {
        return;
    }
    $this->deleteFromIndex((int)$id);
}
849 
/**
 * DataHandler hook: keeps the index in sync after a record was written.
 *
 * A page which was set to hidden or "no search" is removed from the index.
 * The written record is then indexed by every non-running "records"
 * indexing configuration which targets its table and page and has
 * "index on change" enabled.
 *
 * @param string $status DataHandler status ("new" or "update")
 * @param string $table Table name
 * @param string $id Record uid, or the "NEW..." placeholder for new records
 * @param array $fieldArray Fields which were written
 * @param object $pObj DataHandler parent object; substNEWwithIDs is read from it
 */
public function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, $pObj)
{
    // Nothing to do if no field was actually updated.
    if (empty($fieldArray)) {
        return;
    }

    if ($status === 'new') {
        // Translate the placeholder of a new record into its real uid.
        $id = $pObj->substNEWwithIDs[$id];
    } elseif ($table === 'pages' && $status === 'update') {
        $pageGotHidden = array_key_exists('hidden', $fieldArray) && $fieldArray['hidden'] == 1;
        $pageGotExcluded = array_key_exists('no_search', $fieldArray) && $fieldArray['no_search'] == 1;
        if ($pageGotHidden || $pageGotExcluded) {
            // The page must not be found any longer: delete its index.
            $this->deleteFromIndex((int)$id);
        }
    }

    // Get the full record; without it there is nothing to index.
    $currentRecord = BackendUtility::getRecord($table, $id);
    if (!is_array($currentRecord)) {
        return;
    }

    // Select all (not running) indexing configurations of type "record" (1)
    // which point to this table, have "index on change" enabled and are
    // either located on the page of the record (without an alternative
    // source pid) or point to that page as alternative source pid.
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_config');
    $expr = $queryBuilder->expr();

    $isNotRunning = $expr->eq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT));
    $isRecordType = $expr->eq('type', $queryBuilder->createNamedParameter(1, \PDO::PARAM_INT));
    $indexesTable = $expr->eq('table2index', $queryBuilder->createNamedParameter($table, \PDO::PARAM_STR));
    $locatedOnRecordPage = $expr->andX(
        $expr->eq('alternative_source_pid', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT)),
        $expr->eq('pid', $queryBuilder->createNamedParameter($currentRecord['pid'], \PDO::PARAM_INT))
    );
    $pointsToRecordPage = $expr->eq(
        'alternative_source_pid',
        $queryBuilder->createNamedParameter($currentRecord['pid'], \PDO::PARAM_INT)
    );
    $indexesOnChange = $expr->eq('records_indexonchange', $queryBuilder->createNamedParameter(1, \PDO::PARAM_INT));

    $configurations = $queryBuilder->select('*')
        ->from('index_config')
        ->where(
            $isNotRunning,
            $isRecordType,
            $indexesTable,
            $expr->orX($locatedOnRecordPage, $pointsToRecordPage),
            $indexesOnChange
        )
        ->execute();

    while ($cfgRec = $configurations->fetch()) {
        $this->indexSingleRecord($currentRecord, $cfgRec);
    }
}
917 
/**
 * Creates an Indexer instance and initializes it with the configuration for
 * the given page context.
 *
 * @param int $id Page id
 * @param int $type Page type
 * @param int $sys_language_uid sys_language uid of the language of the indexing
 * @param string $MP Mount point variable, if any
 * @param array $uidRL Root line uids
 * @param array $queryArguments Stored as "staticPageArguments" in the indexer configuration
 * @param int $freeIndexUid Uid of the indexing configuration the content belongs to
 * @param int $freeIndexSetId Set id of the indexing run
 * @return Indexer The initialized indexer
 */
protected function initializeIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $queryArguments = [], $freeIndexUid = 0, $freeIndexSetId = 0): Indexer
{
    // Information about the page for which the indexing takes place:
    $pageContext = [
        'id' => $id,
        'type' => $type,
        'sys_language_uid' => $sys_language_uid,
        'MP' => $MP,
        // Group list (hardcoded for now...)
        'gr_list' => '0,-1',
        'staticPageArguments' => $queryArguments,
        'freeIndexUid' => $freeIndexUid,
        'freeIndexSetId' => $freeIndexSetId,
        'rootline_uids' => $uidRL,
    ];
    // Configuration of behavior:
    $behavior = [
        // Whether to index external documents like PDF, DOC etc. (if possible)
        'index_externals' => 1,
        // Length of description text (max 250, default 200)
        'index_descrLgd' => 200,
        // Whether to index document keywords and description (if present)
        'index_metatags' => true
    ];

    $indexer = GeneralUtility::makeInstance(Indexer::class);
    // The keys of both arrays are distinct, so the union simply appends
    // the behavior settings to the page context.
    $indexer->init($pageContext + $behavior);

    return $indexer;
}
965 
/**
 * Indexes title and content as if they were the HTML of a TYPO3 page.
 *
 * Both strings are escaped and wrapped into a minimal HTML document which
 * is handed to the indexer together with the timestamps and record uid.
 *
 * @param Indexer $indexer Indexer instance, already initialized
 * @param string $title Title of the content
 * @param string $content Body content
 * @param int $mtime Most recent modification time (seconds) of the content
 * @param int $crdate Creation date of the content
 * @param int $recordUid Uid of the record, if applicable
 */
protected function indexAsTYPO3Page(Indexer $indexer, $title, $content, $mtime, $crdate = 0, $recordUid = 0)
{
    $escapedTitle = htmlspecialchars($title);
    $escapedContent = htmlspecialchars($content);

    // Construct fake HTML for parsing:
    $fakeHtml = '
 <html>
 <head>
 <title>' . $escapedTitle . '</title>
 </head>
 <body>
 ' . $escapedContent . '
 </body>
 </html>';

    $settings = [
        // Most recent modification time (seconds) of the content
        'mtime' => $mtime,
        // The creation date of the TYPO3 content
        'crdate' => $crdate,
        // UID of the record, if applicable
        'recordUid' => $recordUid,
        // Content string (HTML of TYPO3 page)
        'content' => $fakeHtml,
        // Character set of content (will be converted to utf-8 during indexing)
        'metaCharset' => 'utf-8',
        // Alternative title for indexing
        'indexedDocTitle' => '',
    ];
    foreach ($settings as $key => $value) {
        $indexer->conf[$key] = $value;
    }

    // Index content as if it was a TYPO3 page:
    $indexer->indexTypo3PageContent();
}
1004 }
‪TYPO3\CMS\Core\DataHandling\DataHandler
Definition: DataHandler.php:84
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\checkUrl
‪string checkUrl($url, $urlLog, $baseUrl)
Definition: CrawlerHook.php:586
‪TYPO3\CMS\Core\Utility\PathUtility
Definition: PathUtility.php:24
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\processDatamap_afterDatabaseOperations
‪processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, $pObj)
Definition: CrawlerHook.php:855
‪TYPO3\CMS\Core\Core\Environment\getPublicPath
‪static string getPublicPath()
Definition: Environment.php:180
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook
Definition: CrawlerHook.php:37
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\crawler_execute
‪array crawler_execute($params, &$pObj)
Definition: CrawlerHook.php:188
‪TYPO3\CMS\Core\Utility\PathUtility\stripPathSitePrefix
‪static string stripPathSitePrefix($path)
Definition: PathUtility.php:372
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\indexSingleRecord
‪indexSingleRecord($r, $cfgRec, $rl=null)
Definition: CrawlerHook.php:655
‪TYPO3\CMS\Core\Utility\MathUtility\forceIntegerInRange
‪static int forceIntegerInRange($theInt, $min, $max=2000000000, $defaultValue=0)
Definition: MathUtility.php:32
‪TYPO3\CMS\Core\Utility\RootlineUtility
Definition: RootlineUtility.php:39
‪TYPO3\CMS\Core\TypoScript\ExtendedTemplateService
Definition: ExtendedTemplateService.php:43
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\crawler_execute_type3
‪crawler_execute_type3($cfgRec, &$session_data, $params, &$pObj)
Definition: CrawlerHook.php:388
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\$secondsPerExternalUrl
‪int $secondsPerExternalUrl
Definition: CrawlerHook.php:42
‪TYPO3\CMS\Core\Utility\GeneralUtility\get_dirs
‪static string[] string null get_dirs($path)
Definition: GeneralUtility.php:2170
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\$instanceCounter
‪int $instanceCounter
Definition: CrawlerHook.php:48
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\indexExtUrl
‪array indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
Definition: CrawlerHook.php:611
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\cleanUpOldRunningConfigurations
‪cleanUpOldRunningConfigurations()
Definition: CrawlerHook.php:487
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\crawler_execute_type4
‪crawler_execute_type4($cfgRec, &$session_data, $params, &$pObj)
Definition: CrawlerHook.php:428
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\addQueueEntryForHook
‪addQueueEntryForHook($cfgRec, $title)
Definition: CrawlerHook.php:762
‪TYPO3\CMS\Backend\Utility\BackendUtility
Definition: BackendUtility.php:75
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\$callBack
‪string $callBack
Definition: CrawlerHook.php:52
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\crawler_execute_type2
‪crawler_execute_type2($cfgRec, &$session_data, $params, &$pObj)
Definition: CrawlerHook.php:329
‪TYPO3\CMS\Backend\Utility\BackendUtility\getRecord
‪static array null getRecord($table, $uid, $fields=' *', $where='', $useDeleteClause=true)
Definition: BackendUtility.php:95
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\deleteFromIndex
‪deleteFromIndex($id)
Definition: CrawlerHook.php:778
‪TYPO3\CMS\Core\Utility\GeneralUtility\trimExplode
‪static string[] trimExplode($delim, $string, $removeEmptyValues=false, $limit=0)
Definition: GeneralUtility.php:1059
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\processCmdmap_preProcess
‪processCmdmap_preProcess($command, $table, $id, $value, $pObj)
Definition: CrawlerHook.php:838
‪TYPO3\CMS\IndexedSearch\Indexer\indexTypo3PageContent
‪indexTypo3PageContent()
Definition: Indexer.php:275
‪TYPO3\CMS\Core\Database\Connection
Definition: Connection.php:36
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\crawler_execute_type1
‪crawler_execute_type1($cfgRec, &$session_data, $params, &$pObj)
Definition: CrawlerHook.php:262
‪$GLOBALS
‪$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['adminpanel']['modules']
Definition: ext_localconf.php:5
‪TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction
Definition: DeletedRestriction.php:28
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\$pObj
‪object $pObj
Definition: CrawlerHook.php:56
‪TYPO3\CMS\Core\Core\Environment
Definition: Environment.php:40
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\initializeIndexer
‪Indexer initializeIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $queryArguments=[], $freeIndexUid=0, $freeIndexSetId=0)
Definition: CrawlerHook.php:927
‪TYPO3\CMS\Core\Utility\MathUtility
Definition: MathUtility.php:22
‪TYPO3\CMS\Core\Exception\Page\RootLineException
Definition: RootLineException.php:25
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\generateNextIndexingTime
‪int generateNextIndexingTime($cfgRec)
Definition: CrawlerHook.php:717
‪TYPO3\CMS\Core\Database\ConnectionPool
Definition: ConnectionPool.php:46
‪TYPO3\CMS\IndexedSearch\Indexer
Definition: Indexer.php:37
‪TYPO3\CMS\Core\Utility\GeneralUtility
Definition: GeneralUtility.php:46
‪TYPO3\CMS\IndexedSearch\Hook
Definition: CrawlerFilesHook.php:16
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\checkDeniedSuburls
‪bool checkDeniedSuburls($url, $url_deny)
Definition: CrawlerHook.php:743
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\indexAsTYPO3Page
‪indexAsTYPO3Page(Indexer $indexer, $title, $content, $mtime, $crdate=0, $recordUid=0)
Definition: CrawlerHook.php:972
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\crawler_init
‪crawler_init(&$pObj)
Definition: CrawlerHook.php:65
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\getUidRootLineForClosestTemplate
‪array getUidRootLineForClosestTemplate($id)
Definition: CrawlerHook.php:692