TYPO3 CMS  TYPO3_8-7
CrawlerHook.php
Go to the documentation of this file.
1 <?php
3 
4 /*
5  * This file is part of the TYPO3 CMS project.
6  *
7  * It is free software; you can redistribute it and/or modify it under
8  * the terms of the GNU General Public License, either version 2
9  * of the License, or any later version.
10  *
11  * For the full copyright and license information, please read the
12  * LICENSE.txt file that was distributed with this source code.
13  *
14  * The TYPO3 project - inspiring people to share!
15  */
16 
19 
24 {
/**
 * Provides the message announcing that a custom example indexing session begins.
 *
 * @return string The fixed start-of-session text
 */
public function initMessage()
{
    $startMessage = 'Start of Custom Example Indexing session!';

    return $startMessage;
}
34 
/**
 * Runs one instance ("step") of the example indexing session.
 *
 * Each call increments the step counter kept in $session_data and performs the
 * matching example:
 *  - step 1: index two in-memory example items as parameterized TYPO3 pages,
 *  - step 2: index a document from the file system,
 *  - step 3: index an external URL.
 * As long as further steps remain, a queue entry for the next instance is
 * registered through $pObj.
 *
 * @param array $cfgRec Indexing configuration record; the keys "pid", "uid" and "set_id" are read
 * @param array|mixed $session_data Session data, passed by reference; (re)initialised with a "step" counter when missing
 * @param array $params Not used by this example
 * @param object $pObj Calling object; must offer getUidRootLineForClosestTemplate() and addQueueEntryForHook()
 * @return void
 */
public function indexOperation($cfgRec, &$session_data, $params, &$pObj)
{
    // Set up language uid, if any:
    $sys_language_uid = 0;
    // Number of example steps this session consists of (one per "case" below):
    $totalSteps = 3;

    // Init session data if not already done. The counter is also created when an
    // array without "step" arrives, so the increment below never hits an undefined index:
    if (!is_array($session_data)) {
        $session_data = [];
    }
    if (!isset($session_data['step'])) {
        $session_data['step'] = 0;
    }
    // Increase step counter (example of how session data can track how many indexing instances are left):
    $session_data['step'] = (int)$session_data['step'] + 1;
    $step = $session_data['step'];

    switch ($step) {
        case 1:
            // Indexing Example: Content accessed with GET parameters added to URL.
            // Get rootline from the Indexing Record (needed because the indexer relates
            // all search results to a position in the page tree!) [DON'T CHANGE]:
            $rl = $pObj->getUidRootLineForClosestTemplate($cfgRec['pid']);
            // Set up 2 example items to index:
            $exampleItems = [
                [
                    'ID' => '123',
                    'title' => 'Title of Example 1',
                    'content' => 'Vestibulum leo turpis, fringilla sit amet, semper eget, vestibulum ut, arcu. Vestibulum mauris orci, vulputate quis, congue eget, nonummy'
                ],
                [
                    'ID' => 'example2',
                    'title' => 'Title of Example 2',
                    'content' => 'Cras tortor turpis, vulputate non, accumsan a, pretium in, magna. Cras turpis turpis, pretium pulvinar, pretium vel, nonummy eu.'
                ]
            ];
            // For each item, index it (you might do this in batches of e.g. 100 items
            // if all your content spans thousands of items!)
            foreach ($exampleItems as $item) {
                // Prepare the GET variables array that must be added to the page URL in order to view the result:
                parse_str('&itemID=' . rawurlencode($item['ID']), $GETparams);
                // The example items carry no timestamps; fall back to 0 explicitly instead of
                // reading undefined array keys (which raised notices and passed null on):
                $mtime = isset($item['tstamp']) ? $item['tstamp'] : 0;
                $crdate = isset($item['create_date']) ? $item['create_date'] : 0;
                // Prepare indexer (make instance, initialize it, set special features for
                // indexing parameterized content) [DON'T CHANGE]:
                $indexerObj = GeneralUtility::makeInstance(Indexer::class);
                $indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, false);
                $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
                $indexerObj->forceIndexing = true;
                // Indexing the content of the item (see \TYPO3\CMS\IndexedSearch\Indexer::backend_indexAsTYPO3Page() for options)
                $indexerObj->backend_indexAsTYPO3Page($item['title'], '', '', $item['content'], 'utf-8', $mtime, $crdate, $item['ID']);
            }
            break;
        case 2:
            // Indexing Example: Content accessed directly in file system.
            // Get rootline from the Indexing Record (needed because the indexer relates
            // all search results to a position in the page tree!) [DON'T CHANGE]:
            $rl = $pObj->getUidRootLineForClosestTemplate($cfgRec['pid']);
            // Prepare indexer (make instance, initialize it) [DON'T CHANGE]:
            $indexerObj = GeneralUtility::makeInstance(Indexer::class);
            $indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl);
            $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
            // To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
            $indexerObj->hash['phash'] = -1;
            // Index document:
            $indexerObj->indexRegularDocument('fileadmin/templates/index.html', true);
            break;
        case 3:
            // Indexing Example: Content accessed on External URLs.
            $indexerObj = GeneralUtility::makeInstance(Indexer::class);
            $indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', null);
            $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
            // To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
            $indexerObj->hash['phash'] = -1;
            // Index external URL (HTML only):
            $indexerObj->indexExternalUrl('http://www.google.com/');
            break;
    }

    // Finally, set entry for the next indexing instance, but only while steps remain.
    // (Queueing after the last step would only trigger an extra call that does nothing.)
    if ($step < $totalSteps) {
        // Just an information field describing what the queued instance will do, hence the
        // number of the NEXT step. Never mind that the field is called "url" - this is what
        // will be shown in the "crawler" log.
        $title = 'Step #' . ($step + 1) . ' of ' . $totalSteps;
        $pObj->addQueueEntryForHook($cfgRec, $title);
    }
}
124 }
static makeInstance($className,... $constructorArguments)