‪TYPO3CMS  9.5
CrawlerHook.php
Go to the documentation of this file.
1 <?php
3 
4 /*
5  * This file is part of the TYPO3 CMS project.
6  *
7  * It is free software; you can redistribute it and/or modify it under
8  * the terms of the GNU General Public License, either version 2
9  * of the License, or any later version.
10  *
11  * For the full copyright and license information, please read the
12  * LICENSE.txt file that was distributed with this source code.
13  *
14  * The TYPO3 project - inspiring people to share!
15  */
16 
27 
33 {
/**
 * Number of seconds between two queued indexing operations. It is multiplied
 * with $instanceCounter to spread queue entries over time.
 *
 * @var int
 */
public $secondsPerExternalUrl = 3;

/**
 * Counts up for each queue entry scheduled with a time offset (files,
 * external URLs and sub pages).
 *
 * @var int
 */
public $instanceCounter = 0;

/**
 * Class name handed to the crawler as call back for every queue entry added by this hook.
 *
 * @var string
 */
public $callBack = self::class;

/**
 * Crawler parent object. Assigned in crawler_execute() before a custom hook
 * runs, and used by addQueueEntryForHook(). Declared explicitly because it
 * was previously only created as a dynamic property.
 *
 * @var object|null
 */
public $pObj;
52 
/**
 * Makes sure a language service object is available in $GLOBALS['LANG'].
 */
public function __construct()
{
    // To make sure the backend charset is available:
    if (!is_object($GLOBALS['LANG'] ?? null)) {
        $GLOBALS['LANG'] = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Localization\LanguageService::class);
        // When no backend user (or no language preference) is set, null is
        // passed on - the same value as before, but without PHP notices.
        $GLOBALS['LANG']->init($GLOBALS['BE_USER']->uc['lang'] ?? null);
    }
}
64 
/**
 * Initialization of the crawler hook.
 *
 * Selects all index_config records whose timer_next_indexing lies before the
 * current execution time and which are not running (set_id = 0), marks each
 * of them as started (new set_id, next indexing time, empty session data)
 * and adds the first entry for it to the crawler queue.
 *
 * @param object $pObj Parent object; addQueueEntry_callBack() is called on it
 */
public function crawler_init(&$pObj)
{
    // Select all indexing configuration which are waiting to be activated:
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_config');
    $queryBuilder = $connection->createQueryBuilder();

    $result = $queryBuilder->select('*')
        ->from('index_config')
        ->where(
            $queryBuilder->expr()->lt(
                'timer_next_indexing',
                $queryBuilder->createNamedParameter($GLOBALS['EXEC_TIME'], \PDO::PARAM_INT)
            ),
            $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT))
        )
        ->execute();

    // For each configuration, check if it should be executed and if so, start:
    while ($cfgRec = $result->fetch()) {
        // Generate a unique set-ID:
        $setId = GeneralUtility::md5int(microtime());
        // Get next time:
        $nextTime = $this->generateNextIndexingTime($cfgRec);
        // Start process by updating index-config record:
        $connection->update(
            'index_config',
            [
                'set_id' => $setId,
                'timer_next_indexing' => $nextTime,
                'session_data' => ''
            ],
            [
                'uid' => (int)$cfgRec['uid']
            ]
        );

        // Build the parameters of the first queue entry based on configuration type.
        // $params stays null when nothing has to be queued.
        $params = null;
        $procInstructions = ['[Index Cfg UID#' . $cfgRec['uid'] . ']'];
        switch ($cfgRec['type']) {
            case 1:
                // RECORDS:
                $params = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'procInstructions' => $procInstructions,
                    'url' => 'Records (start)'
                ];
                break;
            case 2:
                // FILES:
                $params = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'procInstructions' => $procInstructions,
                    'url' => $cfgRec['filepath'],
                    'depth' => 0
                ];
                break;
            case 3:
                // External URL:
                $params = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'procInstructions' => $procInstructions,
                    'url' => $cfgRec['externalUrl'],
                    'depth' => 0
                ];
                break;
            case 4:
                // Page tree (the "url" is the uid of the root page):
                $params = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'procInstructions' => $procInstructions,
                    'url' => (int)$cfgRec['alternative_source_pid'],
                    'depth' => 0
                ];
                break;
            case 5:
                // Meta configuration, nothing to do:
                break;
            default:
                // Custom type: delegate to a registered hook class, if any.
                $hookClass = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']] ?? null;
                if ($hookClass) {
                    $hookObj = GeneralUtility::makeInstance($hookClass);
                    // $message was never defined before being handed to the hook;
                    // pass an explicit null, which is what it evaluated to.
                    $message = null;
                    $params = [
                        'indexConfigUid' => $cfgRec['uid'],
                        'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . '/CUSTOM]'],
                        'url' => $hookObj->initMessage($message)
                    ];
                }
        }
        if ($params !== null) {
            $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
        }
    }
    // Finally, look up all old index configurations which are finished and needs to be reset and done.
    $this->cleanUpOldRunningConfigurations();
}
184 
/**
 * Call back function for execution of a queue entry.
 *
 * Loads the index_config record named by $params['indexConfigUid'],
 * dispatches to the handler for its type and writes the (possibly modified)
 * session data back to the record.
 *
 * @param array $params Queue entry parameters; "indexConfigUid" is required for anything to happen
 * @param object $pObj Parent object, passed on to the type handlers
 * @return array Always ['log' => $params]
 */
public function crawler_execute($params, &$pObj)
{
    $logResult = ['log' => $params];
    // Indexer configuration ID must exist:
    if (empty($params['indexConfigUid'])) {
        return $logResult;
    }

    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    $queryBuilder = $connectionPool->getQueryBuilderForTable('index_config');
    $queryBuilder->getRestrictions()->removeAll();
    // Load the indexing configuration record:
    $cfgRec = $queryBuilder
        ->select('*')
        ->from('index_config')
        ->where(
            $queryBuilder->expr()->eq(
                'uid',
                $queryBuilder->createNamedParameter($params['indexConfigUid'], \PDO::PARAM_INT)
            )
        )
        ->execute()
        ->fetch();
    if (!is_array($cfgRec)) {
        return $logResult;
    }

    // Unpack session data. Object instantiation is disabled so that manipulated
    // database content cannot trigger object injection; the handlers in this
    // class only store arrays and scalars.
    $session_data = unserialize((string)$cfgRec['session_data'], ['allowed_classes' => false]);
    // Select which type:
    switch ($cfgRec['type']) {
        case 1:
            // Records:
            $this->crawler_execute_type1($cfgRec, $session_data, $params, $pObj);
            break;
        case 2:
            // Files
            $this->crawler_execute_type2($cfgRec, $session_data, $params, $pObj);
            break;
        case 3:
            // External URL:
            $this->crawler_execute_type3($cfgRec, $session_data, $params, $pObj);
            break;
        case 4:
            // Page tree:
            $this->crawler_execute_type4($cfgRec, $session_data, $params, $pObj);
            break;
        case 5:
            // Meta
            // NOOP (should never enter here!)
            break;
        default:
            $hookClass = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']] ?? null;
            if ($hookClass) {
                $hookObj = GeneralUtility::makeInstance($hookClass);
                // Keep the parent object around for addQueueEntryForHook()
                $this->pObj = $pObj;
                $hookObj->indexOperation($cfgRec, $session_data, $params, $this);
            }
    }
    // Save process data which might be modified:
    $connectionPool
        ->getConnectionForTable('index_config')
        ->update(
            'index_config',
            ['session_data' => serialize($session_data)],
            ['uid' => (int)$cfgRec['uid']]
        );

    return $logResult;
}
256 
/**
 * Indexing records from a table (type 1), one batch per call.
 *
 * Records are processed in uid order; the uid of the last processed record
 * is kept in $session_data['uid']. If records with a higher uid remain, a
 * new queue entry for the next batch is added.
 *
 * @param array $cfgRec Indexing configuration record
 * @param array $session_data Session data for the indexing session, passed by reference
 * @param array $params Parameters from the queue entry (not used here)
 * @param object $pObj Parent object; addQueueEntry_callBack() is called on it
 */
public function crawler_execute_type1($cfgRec, &$session_data, $params, &$pObj)
{
    if (!$cfgRec['table2index'] || !isset($GLOBALS['TCA'][$cfgRec['table2index']])) {
        return;
    }
    // Init session data array if not already:
    if (!is_array($session_data)) {
        $session_data = [
            'uid' => 0
        ];
    }
    // Init:
    $table = $cfgRec['table2index'];
    $pid = (int)$cfgRec['alternative_source_pid'] ?: $cfgRec['pid'];
    $numberOfRecords = $cfgRec['recordsbatch']
        ? MathUtility::forceIntegerInRange($cfgRec['recordsbatch'], 1)
        : 100;

    // Get root line:
    $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Select the next batch of records:
    $queryBuilder = $connectionPool->getQueryBuilderForTable($table);
    $result = $queryBuilder->select('*')
        ->from($table)
        ->where(
            $queryBuilder->expr()->eq(
                'pid',
                $queryBuilder->createNamedParameter($pid, \PDO::PARAM_INT)
            ),
            $queryBuilder->expr()->gt(
                'uid',
                $queryBuilder->createNamedParameter($session_data['uid'], \PDO::PARAM_INT)
            )
        )
        ->orderBy('uid')
        ->setMaxResults($numberOfRecords)
        ->execute();

    // Traverse:
    while ($row = $result->fetch()) {
        // Index single record:
        $this->indexSingleRecord($row, $cfgRec, $rootLine);
        // Update the UID we last processed:
        $session_data['uid'] = $row['uid'];
    }

    // Count the records that are still left. This needs its own query builder:
    // re-using the one above would keep the uid parameter bound BEFORE the batch
    // was processed (plus its LIMIT and ORDER BY), which reports "more records"
    // after every non-empty batch and queues one superfluous, empty run.
    $countQueryBuilder = $connectionPool->getQueryBuilderForTable($table);
    $rowCount = $countQueryBuilder->count('uid')
        ->from($table)
        ->where(
            $countQueryBuilder->expr()->eq(
                'pid',
                $countQueryBuilder->createNamedParameter($pid, \PDO::PARAM_INT)
            ),
            $countQueryBuilder->expr()->gt(
                'uid',
                $countQueryBuilder->createNamedParameter($session_data['uid'], \PDO::PARAM_INT)
            )
        )
        ->execute()
        ->fetchColumn(0);

    // Finally, set entry for next indexing of batch of records:
    if ($rowCount) {
        $nparams = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => 'Records from UID#' . ($session_data['uid'] + 1) . '-?',
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']']
        ];
        $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
    }
}
323 
/**
 * Indexing files from the file system (type 2).
 *
 * If $params['url'] points to a file, the file is indexed. If it points to a
 * directory, a queue entry is added for each matching file in it and - while
 * the configured depth is not reached - for each sub directory.
 *
 * @param array $cfgRec Indexing configuration record
 * @param array $session_data Session data for the indexing session (not used here)
 * @param array $params Parameters from the queue entry; "url" is the path, "depth" the current depth
 * @param object $pObj Parent object; addQueueEntry_callBack() is called on it
 */
public function crawler_execute_type2($cfgRec, &$session_data, $params, &$pObj)
{
    // Prepare path, making it absolute and checking:
    $readpath = $params['url'];
    if (!GeneralUtility::isAbsPath($readpath)) {
        $readpath = GeneralUtility::getFileAbsFileName($readpath);
    }
    if (!GeneralUtility::isAllowedAbsPath($readpath)) {
        return;
    }

    if (@is_file($readpath)) {
        // If file, index it!
        // Get root line (need to provide this when indexing external files)
        $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
        // (Re)-Indexing file on page.
        $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
        $indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
        $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
        // EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
        $indexerObj->hash['phash'] = -1;
        // Index document:
        $indexerObj->indexRegularDocument(\TYPO3\CMS\Core\Utility\PathUtility::stripPathSitePrefix($readpath), true);
        return;
    }

    if (!@is_dir($readpath)) {
        return;
    }

    // If dir, read content and create new pending items for log:
    // Select files and directories in path:
    $extList = implode(',', GeneralUtility::trimExplode(',', $cfgRec['extensions'], true));
    $fileArr = [];
    $files = GeneralUtility::getAllFilesAndFoldersInPath($fileArr, $readpath, $extList, 0, 0);
    $directoryList = GeneralUtility::get_dirs($readpath);
    if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) {
        // Make sure there is exactly one slash between the directory and the
        // sub directory name, even if the configured path has no trailing slash.
        $directoryPrefix = rtrim($readpath, '/') . '/';
        foreach ($directoryList as $subdir) {
            if ((string)$subdir != '') {
                $files[] = $directoryPrefix . $subdir . '/';
            }
        }
    }
    $files = GeneralUtility::removePrefixPathFromList($files, Environment::getPublicPath() . '/');
    // traverse the items and create log entries:
    foreach ($files as $path) {
        $this->instanceCounter++;
        if ($path === $params['url']) {
            continue;
        }
        $nparams = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $path,
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
            'depth' => $params['depth'] + 1
        ];
        // Spread the entries over time: one every $secondsPerExternalUrl seconds
        $pObj->addQueueEntry_callBack(
            $cfgRec['set_id'],
            $nparams,
            $this->callBack,
            $cfgRec['pid'],
            $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
        );
    }
}
384 
/**
 * Indexing an external URL (type 3).
 *
 * Indexes $params['url'] and - while the configured depth is not reached -
 * queues every link found on it that passes checkUrl() and is not denied by
 * checkDeniedSuburls().
 *
 * @param array $cfgRec Indexing configuration record
 * @param array $session_data Session data for the indexing session; "urlLog" collects the URLs already queued
 * @param array $params Parameters from the queue entry; "url" and "depth" are read
 * @param object $pObj Parent object; addQueueEntry_callBack() is called on it
 */
public function crawler_execute_type3($cfgRec, &$session_data, $params, &$pObj)
{
    // First call of this indexing session: start the log with the entry URL.
    if (!is_array($session_data)) {
        $session_data = ['urlLog' => [$params['url']]];
    }

    // Index the URL itself and collect the links found on it:
    $rootLineUids = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $foundUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rootLineUids, $cfgRec['uid'], $cfgRec['set_id']);

    // Stop here when the maximum depth is reached:
    if (!($params['depth'] < $cfgRec['depth'])) {
        return;
    }

    foreach ($foundUrls as $foundUrl) {
        $acceptedUrl = $this->checkUrl($foundUrl, $session_data['urlLog'], $cfgRec['externalUrl']);
        if (!$acceptedUrl) {
            continue;
        }
        if ($this->checkDeniedSuburls($acceptedUrl, $cfgRec['url_deny'])) {
            continue;
        }

        $this->instanceCounter++;
        $session_data['urlLog'][] = $acceptedUrl;

        $scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
        $queueParameters = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $acceptedUrl,
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
            'depth' => $params['depth'] + 1
        ];
        $pObj->addQueueEntry_callBack($cfgRec['set_id'], $queueParameters, $this->callBack, $cfgRec['pid'], $scheduledTime);
    }
}
424 
/**
 * Page tree indexing (type 4).
 *
 * Submits the URLs of the page whose uid is given in $params['url'] to the
 * crawler and - while the configured depth is not reached - queues one entry
 * per non-deleted sub page.
 *
 * @param array $cfgRec Indexing configuration record
 * @param array $session_data Session data for the indexing session; queued sub pages are appended to "urlLog"
 * @param array $params Parameters from the queue entry; "url" (page uid) and "depth" are read
 * @param object $pObj Parent object; getUrlsForPageRow(), urlListFromUrlArray() and addQueueEntry_callBack() are called on it
 */
public function crawler_execute_type4($cfgRec, &$session_data, $params, &$pObj)
{
    $pageUid = (int)$params['url'];
    $pageRow = BackendUtility::getRecord('pages', $pageUid);

    // Submit the URLs of the page itself:
    $urlSets = $pObj->getUrlsForPageRow($pageRow);
    // Registry for duplicates
    $duplicateTrack = [];
    // Dummy.
    $downloadUrls = [];
    if (!empty($urlSets)) {
        foreach ($urlSets as $urlSet) {
            $pObj->urlListFromUrlArray(
                $urlSet,
                $pageRow,
                $GLOBALS['EXEC_TIME'],
                30,
                1,
                0,
                $duplicateTrack,
                $downloadUrls,
                ['tx_indexedsearch_reindex']
            );
        }
    }

    // Sub pages are only added while the maximum depth is not reached:
    if (!($params['depth'] < $cfgRec['depth'])) {
        return;
    }

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
    $queryBuilder->getRestrictions()
        ->removeAll()
        ->add(GeneralUtility::makeInstance(DeletedRestriction::class));
    $parentConstraint = $queryBuilder->expr()->eq(
        'pid',
        $queryBuilder->createNamedParameter($pageUid, \PDO::PARAM_INT)
    );
    $subPages = $queryBuilder->select('uid', 'title')
        ->from('pages')
        ->where($parentConstraint)
        ->execute();

    // Traverse subpages and add to queue:
    while ($subPage = $subPages->fetch()) {
        $this->instanceCounter++;
        $session_data['urlLog'][] = 'pages:' . $subPage['uid'] . ': ' . $subPage['title'];

        $scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
        $queueParameters = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $subPage['uid'],
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
            'depth' => $params['depth'] + 1
        ];
        $pObj->addQueueEntry_callBack($cfgRec['set_id'], $queueParameters, $this->callBack, $cfgRec['pid'], $scheduledTime);
    }
}
488 
/**
 * Look up all running index configurations (set_id != 0) and finish those
 * that have no pending crawler queue entries left: index data written for
 * the configuration under a different set id is deleted and the
 * configuration is reset (set_id = 0, empty session data).
 */
public function cleanUpOldRunningConfigurations()
{
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    // List of tables that store information related to the phash value
    $tablesToClean = [
        'index_phash',
        'index_rel',
        'index_section',
        'index_grlist',
        'index_fulltext',
        'index_debug'
    ];

    $queryBuilder = $connectionPool->getQueryBuilderForTable('index_config');
    $queryBuilder->getRestrictions()
        ->removeAll()
        ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

    // Lookup running index configurations:
    $runningIndexingConfigurations = $queryBuilder->select('*')
        ->from('index_config')
        ->where($queryBuilder->expr()->neq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT)))
        ->execute()
        ->fetchAll();

    // For each running configuration, look up how many log entries there are which are scheduled
    // for execution and if none, clear the "set_id" (means; Processing was DONE)
    foreach ($runningIndexingConfigurations as $cfgRec) {
        // Look for ended processes:
        $queued_items = $connectionPool->getConnectionForTable('tx_crawler_queue')
            ->count(
                '*',
                'tx_crawler_queue',
                [
                    'set_id' => (int)$cfgRec['set_id'],
                    'exec_time' => 0
                ]
            );
        if ($queued_items) {
            // Still entries waiting to be executed
            continue;
        }

        // Lookup old phash rows:
        $queryBuilder = $connectionPool->getQueryBuilderForTable('index_phash');
        $oldPhashRows = $queryBuilder
            ->select('phash')
            ->from('index_phash')
            ->where(
                $queryBuilder->expr()->eq(
                    'freeIndexUid',
                    $queryBuilder->createNamedParameter($cfgRec['uid'], \PDO::PARAM_INT)
                ),
                $queryBuilder->expr()->neq(
                    'freeIndexSetId',
                    $queryBuilder->createNamedParameter($cfgRec['set_id'], \PDO::PARAM_INT)
                )
            )
            ->execute()
            ->fetchAll();

        // Removing old registrations for all tables. Skipped when there is nothing
        // to remove, so no DELETE with an empty phash list is sent to the database.
        $oldPhashList = array_column($oldPhashRows, 'phash');
        if (!empty($oldPhashList)) {
            foreach ($tablesToClean as $table) {
                $queryBuilder = $connectionPool->getQueryBuilderForTable($table);
                $queryBuilder->delete($table)
                    ->where(
                        $queryBuilder->expr()->in(
                            'phash',
                            $queryBuilder->createNamedParameter($oldPhashList, Connection::PARAM_INT_ARRAY)
                        )
                    )
                    ->execute();
            }
        }

        // End process by updating index-config record:
        $connectionPool->getConnectionForTable('index_config')
            ->update(
                'index_config',
                [
                    'set_id' => 0,
                    'session_data' => ''
                ],
                ['uid' => (int)$cfgRec['uid']]
            );
    }
}
577 
578  /*****************************************
579  *
580  * Helper functions
581  *
582  *****************************************/
/**
 * Check if an input URL is allowed to be indexed.
 *
 * A trailing double slash is reduced to one slash and everything from the
 * first "#" on is cut off. The result is accepted if it contains no "../",
 * starts with $baseUrl and is not yet part of $urlLog.
 *
 * @param string $url URL string to check
 * @param array $urlLog URLs that are already registered
 * @param string $baseUrl The URL must start with this string
 * @return string|null The cleaned URL if it is accepted, otherwise null
 */
public function checkUrl($url, $urlLog, $baseUrl)
{
    $normalizedUrl = preg_replace('/\\/\\/$/', '/', $url);
    $normalizedUrl = explode('#', $normalizedUrl)[0];

    if (strpos($normalizedUrl, '../') !== false) {
        return null;
    }
    if (!GeneralUtility::isFirstPartOfStr($normalizedUrl, $baseUrl)) {
        return null;
    }
    if (in_array($normalizedUrl, $urlLog)) {
        return null;
    }
    return $normalizedUrl;
}
603 
/**
 * Indexes an external URL and returns the links found in its content as
 * absolute URLs.
 *
 * Links without a scheme are resolved against the scheme/host(/port) of $url
 * when they start with "/", otherwise against the base href of the document
 * (or, if it declares none, the directory part of $url).
 *
 * @param string $url URL to index
 * @param int $pageId Page id the indexed content is related to
 * @param array $rl Root line uids
 * @param int $cfgUid Uid of the indexing configuration
 * @param int $setId Set id of the current indexing run
 * @return array List of URLs found on the page (not filtered)
 */
public function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
{
    // Index external URL:
    $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
    $indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
    $indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
    // To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
    $indexerObj->hash['phash'] = -1;
    $indexerObj->indexExternalUrl($url);

    // parse_url() returns false for seriously malformed URLs and omits components that are not present
    $urlParts = parse_url($url);
    if (!is_array($urlParts)) {
        $urlParts = [];
    }
    $baseAbsoluteHref = ($urlParts['scheme'] ?? '') . '://' . ($urlParts['host'] ?? '');
    if (isset($urlParts['port'])) {
        // Keep a non-default port, otherwise the sub URLs point to a different server
        $baseAbsoluteHref .= ':' . $urlParts['port'];
    }

    $baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content);
    if (!$baseHref) {
        // Extract base href from current URL: everything up to the last slash of the path
        $path = $urlParts['path'] ?? '';
        $baseHref = $baseAbsoluteHref . substr($path, 0, (int)strrpos($path, '/'));
    }
    $baseHref = rtrim($baseHref, '/');

    // Get URLs on this page:
    $subUrls = [];
    $list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
    // Traverse links:
    foreach ($list as $linkInfo) {
        // Decode entities:
        $subUrl = htmlspecialchars_decode($linkInfo['href']);
        $subUrlParts = parse_url($subUrl);
        if (!is_array($subUrlParts) || empty($subUrlParts['scheme'])) {
            // No scheme: treat it as a link relative to the current document or host
            $relativeUrl = GeneralUtility::resolveBackPath($subUrl);
            if (strpos($relativeUrl, '/') === 0) {
                $subUrl = $baseAbsoluteHref . $relativeUrl;
            } else {
                $subUrl = $baseHref . '/' . $relativeUrl;
            }
        }
        $subUrls[] = $subUrl;
    }
    return $subUrls;
}
652 
/**
 * Indexing a single record as if it was a page.
 *
 * The first field of the configured field list is used as title, all other
 * fields are concatenated (space separated) as content. HTML tags are
 * stripped from both.
 *
 * @param array $r Record to index
 * @param array $cfgRec Indexing configuration record
 * @param array|null $rl Root line uids; looked up from the configuration's pid when not given
 */
public function indexSingleRecord($r, $cfgRec, $rl = null)
{
    // Init:
    $rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $fieldList = GeneralUtility::trimExplode(',', $cfgRec['fieldlist'], true);
    // The TCA "ctrl" section may lack any of the keys read below
    $tcaCtrl = $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl'] ?? [];
    $languageField = $tcaCtrl['languageField'] ?? '';
    $sys_language_uid = $languageField ? ($r[$languageField] ?? 0) : 0;

    // (Re)-Indexing a row from a table:
    $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
    parse_str(str_replace('###UID###', $r['uid'], $cfgRec['get_params']), $GETparams);
    $indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, (bool)$cfgRec['chashcalc']);
    $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
    $indexerObj->forceIndexing = true;

    // Both are initialized so that an empty field list yields an empty title
    // instead of reading an undefined variable.
    $theTitle = '';
    $theContent = '';
    foreach ($fieldList as $k => $v) {
        $fieldValue = $r[$v] ?? '';
        if (!$k) {
            $theTitle = $fieldValue;
        } else {
            $theContent .= $fieldValue . ' ';
        }
    }

    // Modification and creation time, 0 when the table does not define such a field
    $tstampField = $tcaCtrl['tstamp'] ?? '';
    $crdateField = $tcaCtrl['crdate'] ?? '';
    $mtime = $tstampField !== '' ? ($r[$tstampField] ?? 0) : 0;
    $crdate = $crdateField !== '' ? ($r[$crdateField] ?? 0) : 0;

    // Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid())
    // A space is put in front of every tag so words do not get glued together when tags are stripped.
    $indexerObj->backend_indexAsTYPO3Page(
        strip_tags(str_replace('<', ' <', $theTitle)),
        '',
        '',
        strip_tags(str_replace('<', ' <', $theContent)),
        'utf-8',
        $mtime,
        $crdate,
        $r['uid']
    );
}
684 
/**
 * Get the rootline of page $id, run it through the template service and
 * return the uids of the resulting rootline.
 *
 * @param int $id Page uid
 * @return array Rootline uids, keyed like the template service's rootline; empty when the rootline cannot be built
 */
public function getUidRootLineForClosestTemplate($id)
{
    try {
        $pageRootLine = GeneralUtility::makeInstance(RootlineUtility::class, $id)->get();
    } catch (RootLineException $e) {
        // No rootline, no uids
        return [];
    }

    // This generates the constants/config + hierarchy info for the template.
    $templateService = GeneralUtility::makeInstance(\TYPO3\CMS\Core\TypoScript\ExtendedTemplateService::class);
    $uidList = [];
    try {
        $templateService->runThroughTemplates($pageRootLine);
        foreach ($templateService->rootLine as $level => $pageRecord) {
            $uidList[$level] = $pageRecord['uid'];
        }
    } catch (RootLineException $e) {
        // Keep whatever was collected so far
    }
    return $uidList;
}
710 
/**
 * Generate the unix time stamp for the next indexing run of a configuration.
 *
 * The result is the configured offset (seconds after midnight) plus as many
 * whole frequency periods as are needed to reach or pass the current
 * execution time.
 *
 * @param array $cfgRec Index configuration record; timer_frequency, timer_offset and timer_next_indexing are read
 * @return int The next time stamp
 */
public function generateNextIndexingTime($cfgRec)
{
    $currentTime = $GLOBALS['EXEC_TIME'];
    // Now, find a midnight time to use for offset calculation. This has to differ depending on whether
    // we have frequencies within a day or more than a day; Less than a day, we don't care which day to
    // use for offset, more than a day we want to respect the currently entered day as offset regardless
    // of when the script is run - thus the day-of-week used in case "Weekly" is selected will be respected
    if ($cfgRec['timer_frequency'] <= 24 * 3600) {
        // Midnight of yesterday
        $aMidNight = mktime(0, 0, 0) - 1 * 24 * 3600;
    } else {
        $lastTime = (int)($cfgRec['timer_next_indexing'] ?: $GLOBALS['EXEC_TIME']);
        // Four-digit year: mktime() maps two-digit years to a fixed 1970-2069 window
        $aMidNight = mktime(
            0,
            0,
            0,
            (int)date('n', $lastTime),
            (int)date('j', $lastTime),
            (int)date('Y', $lastTime)
        );
    }
    // Find last offset time plus frequency in seconds:
    $lastSureOffset = $aMidNight + MathUtility::forceIntegerInRange($cfgRec['timer_offset'], 0, 86400);
    $frequencySeconds = MathUtility::forceIntegerInRange($cfgRec['timer_frequency'], 1);
    // Now, find out how many blocks of the length of frequency there is until the next time:
    $frequencyBlocksUntilNextTime = (int)ceil(($currentTime - $lastSureOffset) / $frequencySeconds);
    // Set next time to the offset + the frequencyblocks multiplied with the frequency length in seconds.
    // ceil() yields a float, hence the cast above to return an integer time stamp.
    return $lastSureOffset + $frequencyBlocksUntilNextTime * $frequencySeconds;
}
735 
/**
 * Checks if $url starts with one of the entries of the deny list.
 *
 * @param string $url URL to test
 * @param string $url_deny Line feed separated list of denied URL prefixes
 * @return bool TRUE if the URL starts with one of the listed prefixes
 */
public function checkDeniedSuburls($url, $url_deny)
{
    if (!trim($url_deny)) {
        // Nothing is denied
        return false;
    }
    $deniedPrefixes = GeneralUtility::trimExplode(LF, $url_deny, true);
    foreach ($deniedPrefixes as $deniedPrefix) {
        if (GeneralUtility::isFirstPartOfStr($url, $deniedPrefix)) {
            return true;
        }
    }
    return false;
}
755 
/**
 * Adds a queue entry for an indexing configuration via the parent object
 * stored in $this->pObj.
 *
 * @param array $cfgRec Indexing configuration record; uid, set_id and pid are read
 * @param string $title Value stored as "url" of the queue entry
 */
public function addQueueEntryForHook($cfgRec, $title)
{
    // "indexConfigUid" must ALWAYS be the cfgRec uid!
    $configurationUid = $cfgRec['uid'];
    $queueParameters = [
        'indexConfigUid' => $configurationUid,
        'url' => $title,
        'procInstructions' => ['[Index Cfg UID#' . $configurationUid . ']']
    ];
    $this->pObj->addQueueEntry_callBack(
        $cfgRec['set_id'],
        $queueParameters,
        $this->callBack,
        $cfgRec['pid']
    );
}
772 
/**
 * Deletes all index data of a page: every phash registered in index_section
 * for the page id is removed from all phash related index tables.
 *
 * @param int $id Uid of the page whose index entries are deleted
 */
public function deleteFromIndex($id)
{
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Lookup old phash rows:
    $sectionQuery = $connectionPool->getQueryBuilderForTable('index_section');
    $pageConstraint = $sectionQuery->expr()->eq(
        'page_id',
        $sectionQuery->createNamedParameter($id, \PDO::PARAM_INT)
    );
    $oldPhashRows = $sectionQuery->select('phash')
        ->from('index_section')
        ->where($pageConstraint)
        ->execute()
        ->fetchAll();

    // Nothing indexed for this page
    if (empty($oldPhashRows)) {
        return;
    }

    $phashList = array_column($oldPhashRows, 'phash');
    $indexTables = [
        'index_debug',
        'index_fulltext',
        'index_grlist',
        'index_phash',
        'index_rel',
        'index_section',
    ];
    foreach ($indexTables as $indexTable) {
        $deleteQuery = $connectionPool->getQueryBuilderForTable($indexTable);
        $phashConstraint = $deleteQuery->expr()->in(
            'phash',
            $deleteQuery->createNamedParameter($phashList, Connection::PARAM_INT_ARRAY)
        );
        $deleteQuery->delete($indexTable)
            ->where($phashConstraint)
            ->execute();
    }
}
823 
824  /*************************
825  *
826  * Hook functions for DataHandler (indexing of records)
827  *
828  *************************/
/**
 * DataHandler hook: removes the index entries of a page before the page is deleted.
 *
 * @param string $command DataHandler command
 * @param string $table Table name
 * @param string $id Record uid
 * @param mixed $value Command value (not used here)
 * @param object $pObj DataHandler parent object (not used here)
 */
public function processCmdmap_preProcess($command, $table, $id, $value, $pObj)
{
    $isPageDeletion = $command === 'delete' && $table === 'pages';
    if (!$isPageDeletion) {
        return;
    }
    // Clean up the index
    $this->deleteFromIndex($id);
}
845 
/**
 * DataHandler hook: (re-)indexes a record after it was written to the database.
 *
 * A page that was updated to be hidden or excluded from search gets its
 * index entries deleted. Afterwards the record is indexed for every
 * non-running "records" indexing configuration that covers its table and
 * page and has "records_indexonchange" set.
 *
 * @param string $status DataHandler status, "new" or "update"
 * @param string $table Table name
 * @param string $id Record uid, or the NEW... placeholder when $status is "new"
 * @param array $fieldArray Fields that were written
 * @param object $pObj DataHandler parent object; substNEWwithIDs is read from it
 */
public function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, $pObj)
{
    // Nothing was actually updated
    if (empty($fieldArray)) {
        return;
    }

    if ($status === 'new') {
        // Translate new ids.
        $id = $pObj->substNEWwithIDs[$id];
    } elseif ($table === 'pages' && $status === 'update') {
        $wasHidden = array_key_exists('hidden', $fieldArray) && $fieldArray['hidden'] == 1;
        $wasExcluded = $wasHidden || (array_key_exists('no_search', $fieldArray) && $fieldArray['no_search'] == 1);
        if ($wasExcluded) {
            // If the page should be hidden or not indexed after update, delete index for this page
            $this->deleteFromIndex($id);
        }
    }

    // Get full record and if exists, search for indexing configurations:
    $currentRecord = BackendUtility::getRecord($table, $id);
    if (!is_array($currentRecord)) {
        return;
    }

    // Select all (not running) indexing configurations of type "record" (1) and
    // which points to this table and is located on the same page as the record
    // or pointing to the right source PID
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_config');
    $expr = $queryBuilder->expr();

    $notRunning = $expr->eq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT));
    $isRecordType = $expr->eq('type', $queryBuilder->createNamedParameter(1, \PDO::PARAM_INT));
    $coversTable = $expr->eq(
        'table2index',
        $queryBuilder->createNamedParameter($table, \PDO::PARAM_STR)
    );
    $coversPage = $expr->orX(
        $expr->andX(
            $expr->eq(
                'alternative_source_pid',
                $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT)
            ),
            $expr->eq(
                'pid',
                $queryBuilder->createNamedParameter($currentRecord['pid'], \PDO::PARAM_INT)
            )
        ),
        $expr->eq(
            'alternative_source_pid',
            $queryBuilder->createNamedParameter($currentRecord['pid'], \PDO::PARAM_INT)
        )
    );
    $indexOnChange = $expr->eq(
        'records_indexonchange',
        $queryBuilder->createNamedParameter(1, \PDO::PARAM_INT)
    );

    $configurations = $queryBuilder->select('*')
        ->from('index_config')
        ->where($notRunning, $isRecordType, $coversTable, $coversPage, $indexOnChange)
        ->execute();

    while ($configuration = $configurations->fetch()) {
        $this->indexSingleRecord($currentRecord, $configuration);
    }
}
913 }
‪TYPO3\CMS\Core\DataHandling\DataHandler
Definition: DataHandler.php:81
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\checkUrl
‪string checkUrl($url, $urlLog, $baseUrl)
Definition: CrawlerHook.php:588
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\processDatamap_afterDatabaseOperations
‪processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, $pObj)
Definition: CrawlerHook.php:852
‪TYPO3\CMS\Core\Core\Environment\getPublicPath
‪static string getPublicPath()
Definition: Environment.php:153
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook
Definition: CrawlerHook.php:33
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\crawler_execute
‪array crawler_execute($params, &$pObj)
Definition: CrawlerHook.php:189
‪TYPO3\CMS\Core\Utility\PathUtility\stripPathSitePrefix
‪static string stripPathSitePrefix($path)
Definition: PathUtility.php:371
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\indexSingleRecord
‪indexSingleRecord($r, $cfgRec, $rl=null)
Definition: CrawlerHook.php:657
‪TYPO3
‪TYPO3\CMS\Core\Utility\MathUtility\forceIntegerInRange
‪static int forceIntegerInRange($theInt, $min, $max=2000000000, $defaultValue=0)
Definition: MathUtility.php:31
‪TYPO3\CMS\Core\Utility\RootlineUtility
Definition: RootlineUtility.php:36
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\crawler_execute_type3
‪crawler_execute_type3($cfgRec, &$session_data, $params, &$pObj)
Definition: CrawlerHook.php:390
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\$secondsPerExternalUrl
‪int $secondsPerExternalUrl
Definition: CrawlerHook.php:38
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\$instanceCounter
‪int $instanceCounter
Definition: CrawlerHook.php:44
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\indexExtUrl
‪array indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
Definition: CrawlerHook.php:611
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\cleanUpOldRunningConfigurations
‪cleanUpOldRunningConfigurations()
Definition: CrawlerHook.php:489
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\crawler_execute_type4
‪crawler_execute_type4($cfgRec, &$session_data, $params, &$pObj)
Definition: CrawlerHook.php:430
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\addQueueEntryForHook
‪addQueueEntryForHook($cfgRec, $title)
Definition: CrawlerHook.php:759
‪TYPO3\CMS\Backend\Utility\BackendUtility
Definition: BackendUtility.php:72
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\$callBack
‪string $callBack
Definition: CrawlerHook.php:48
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\crawler_execute_type2
‪crawler_execute_type2($cfgRec, &$session_data, $params, &$pObj)
Definition: CrawlerHook.php:329
‪TYPO3\CMS\Backend\Utility\BackendUtility\getRecord
‪static array null getRecord($table, $uid, $fields=' *', $where='', $useDeleteClause=true)
Definition: BackendUtility.php:130
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\deleteFromIndex
‪deleteFromIndex($id)
Definition: CrawlerHook.php:775
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\processCmdmap_preProcess
‪processCmdmap_preProcess($command, $table, $id, $value, $pObj)
Definition: CrawlerHook.php:835
‪TYPO3\CMS\Core\Database\Connection
Definition: Connection.php:31
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\crawler_execute_type1
‪crawler_execute_type1($cfgRec, &$session_data, $params, &$pObj)
Definition: CrawlerHook.php:262
‪$GLOBALS
‪$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['adminpanel']['modules']
Definition: ext_localconf.php:5
‪TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction
Definition: DeletedRestriction.php:26
‪TYPO3\CMS\Core\Core\Environment
Definition: Environment.php:39
‪TYPO3\CMS\Core\Utility\MathUtility
Definition: MathUtility.php:21
‪TYPO3\CMS\Core\Exception\Page\RootLineException
Definition: RootLineException.php:24
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\generateNextIndexingTime
‪int generateNextIndexingTime($cfgRec)
Definition: CrawlerHook.php:714
‪TYPO3\CMS\Core\Database\ConnectionPool
Definition: ConnectionPool.php:44
‪TYPO3\CMS\Core\Utility\GeneralUtility
Definition: GeneralUtility.php:45
‪TYPO3\CMS\IndexedSearch\Hook
Definition: CrawlerFilesHook.php:2
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\checkDeniedSuburls
‪bool checkDeniedSuburls($url, $url_deny)
Definition: CrawlerHook.php:740
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\crawler_init
‪crawler_init(&$pObj)
Definition: CrawlerHook.php:69
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\__construct
‪__construct()
Definition: CrawlerHook.php:53
‪TYPO3\CMS\IndexedSearch\Hook\CrawlerHook\getUidRootLineForClosestTemplate
‪array getUidRootLineForClosestTemplate($id)
Definition: CrawlerHook.php:689