‪TYPO3CMS  ‪main
FileContentParser.php
Go to the documentation of this file.
1 <?php
2 
3 /*
4  * This file is part of the TYPO3 CMS project.
5  *
6  * It is free software; you can redistribute it and/or modify it under
7  * the terms of the GNU General Public License, either version 2
8  * of the License, or any later version.
9  *
10  * For the full copyright and license information, please read the
11  * LICENSE.txt file that was distributed with this source code.
12  *
13  * The TYPO3 project - inspiring people to share!
14  */
15 
17 
18 use Psr\Log\LogLevel;
29 
36 {
42  public int ‪$pdf_mode = -20;
43  public array ‪$app = [];
44  public array ‪$ext2itemtype_map = [];
45  public array ‪$supportedExtensions = [];
46  public Indexer ‪$pObj;
48  protected ?string ‪$lastLocale = null;
49 
/**
 * Selects the object used for resolving "LLL:" label references.
 *
 * For frontend requests the object stored in $GLOBALS['TSFE'] is used,
 * for every other application type the one stored in $GLOBALS['LANG'].
 */
public function __construct()
{
    $applicationType = ApplicationType::fromRequest($GLOBALS['TYPO3_REQUEST']);
    if ($applicationType->isFrontend()) {
        $this->langObject = $GLOBALS['TSFE'];
    } else {
        $this->langObject = $GLOBALS['LANG'];
    }
}
58 
/**
 * Initializes the parser for one file extension.
 *
 * Reads the "indexed_search" extension configuration, refuses extensions that are
 * listed in "ignoreExtensions" and, for formats that need an external command line
 * tool, verifies that the configured binary exists. On success the extension is
 * registered in $this->supportedExtensions and $this->ext2itemtype_map.
 *
 * @param string $extension File extension, e.g. "pdf" or "doc"
 * @return bool TRUE if the extension can be parsed, otherwise FALSE
 */
public function initParser(string $extension): bool
{
    $indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');

    // Extensions the administrator explicitly excluded from indexing.
    // Compared strictly, exactly like searchTypeMediaTitle() does.
    $ignoreExtensions = GeneralUtility::trimExplode(
        ',',
        strtolower((string)($indexerConfig['ignoreExtensions'] ?? '')),
        true
    );
    if (in_array($extension, $ignoreExtensions, true)) {
        $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:ignoreExtensions'), $extension), LogLevel::WARNING);
        return false;
    }

    $extOK = false;
    // Common "item_type" for extensions that are aliases of each other (htm/html, jpg/jpeg).
    $mainExtension = '';
    switch ($extension) {
        case 'pdf':
            // PDF needs both pdfinfo and pdftotext from the same directory.
            $extOK = $this->registerExternalTools($indexerConfig['pdftools'] ?? '', ['pdfinfo', 'pdftotext'], 'pdfTools');
            if ($extOK) {
                $this->pdf_mode = MathUtility::forceIntegerInRange($indexerConfig['pdf_mode'] ?? 0, -100, 100);
            }
            break;
        case 'doc':
            // MS Word (binary format) via catdoc
            $extOK = $this->registerExternalTools($indexerConfig['catdoc'] ?? '', ['catdoc'], 'catdoc');
            break;
        case 'pps':
        case 'ppt':
            // MS PowerPoint (binary format) via ppthtml
            $extOK = $this->registerExternalTools($indexerConfig['ppthtml'] ?? '', ['ppthtml'], 'ppthtml');
            break;
        case 'xls':
            // MS Excel (binary format) via xlhtml
            $extOK = $this->registerExternalTools($indexerConfig['xlhtml'] ?? '', ['xlhtml'], 'xlhtml');
            break;
        case 'docx': // Microsoft Word >= 2007
        case 'dotx':
        case 'pptx': // Microsoft PowerPoint >= 2007
        case 'ppsx':
        case 'potx':
        case 'xlsx': // Microsoft Excel >= 2007
        case 'xltx':
        case 'sxc': // OpenOffice.org 1.x
        case 'sxi':
        case 'sxw':
        case 'ods': // OASIS OpenDocument
        case 'odp':
        case 'odt':
            // All of these are ZIP containers, read via unzip
            $extOK = $this->registerExternalTools($indexerConfig['unzip'] ?? '', ['unzip'], 'unzip');
            break;
        case 'rtf':
            // Rich Text Format via unrtf
            $extOK = $this->registerExternalTools($indexerConfig['unrtf'] ?? '', ['unrtf'], 'unrtf');
            break;
        case 'txt':
        case 'csv':
        case 'xml':
        case 'tif':
            // Handled by PHP itself (raw text, strip_tags(), EXIF), no external tool needed
            $extOK = true;
            break;
        case 'html':
        case 'htm':
            // PHP strip-tags(); "html" is the common item_type
            $extOK = true;
            $mainExtension = 'html';
            break;
        case 'jpg':
        case 'jpeg':
            // PHP EXIF; "jpeg" is the common item_type
            $extOK = true;
            $mainExtension = 'jpeg';
            break;
    }

    if (!$extOK) {
        return false;
    }
    $this->supportedExtensions[$extension] = true;
    $this->ext2itemtype_map[$extension] = $mainExtension ?: $extension;
    return true;
}

/**
 * Verifies that all given command line tools exist in the configured directory
 * and registers their absolute paths in $this->app.
 *
 * Writes a NOTICE to the indexer log when no directory is configured and an ERROR
 * when at least one binary is missing. Nothing is registered unless all binaries
 * were found.
 *
 * @param mixed $configuredPath Directory from the extension configuration (empty value: tool disabled)
 * @param list<string> $toolNames Binary names without the platform specific suffix
 * @param string $labelPrefix Prefix of the "<prefix>NotFound" / "<prefix>Disabled" labels in locallang_main.xlf
 * @return bool TRUE if every binary was found and registered
 */
private function registerExternalTools(mixed $configuredPath, array $toolNames, string $labelPrefix): bool
{
    $labelFile = 'LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:';
    if (!$configuredPath) {
        $this->pObj->log_setTSlogMessage($this->sL($labelFile . $labelPrefix . 'Disabled'), LogLevel::NOTICE);
        return false;
    }
    $directory = rtrim((string)$configuredPath, '/') . '/';
    // On Windows the executables carry an ".exe" suffix
    $exe = Environment::isWindows() ? '.exe' : '';

    $binaries = [];
    foreach ($toolNames as $toolName) {
        $binary = $directory . $toolName . $exe;
        // "@": is_file() may raise a warning (e.g. open_basedir restriction); treated as "not found"
        if (!@is_file($binary)) {
            $this->pObj->log_setTSlogMessage(sprintf($this->sL($labelFile . $labelPrefix . 'NotFound'), $directory), LogLevel::ERROR);
            return false;
        }
        $binaries[$toolName] = $binary;
    }
    foreach ($binaries as $toolName => $binary) {
        $this->app[$toolName] = $binary;
    }
    return true;
}
226 
/**
 * Tells whether the given file extension is one of the extensions this parser
 * knows about. No configuration is read and no external tool is checked.
 *
 * @param string $extension File extension (lowercase), e.g. "pdf"
 * @return bool TRUE if the extension is known to this class
 */
public function softInit(string $extension): bool
{
    $knownExtensions = [
        'pdf',
        'doc', 'docx', 'dotx',
        'pps', 'ppsx', 'ppt', 'pptx', 'potx',
        'xls', 'xlsx', 'xltx',
        'sxc', 'sxi', 'sxw',
        'ods', 'odp', 'odt',
        'rtf', 'txt',
        'html', 'htm',
        'csv', 'xml',
        'jpg', 'jpeg',
        // TIF images (EXIF comment)
        'tif',
    ];

    return in_array($extension, $knownExtensions, true);
}
269 
/**
 * Returns the localized media type title for a file extension.
 *
 * @param string $extension File extension, e.g. "pdf"
 * @return false|string FALSE if the extension is listed in "ignoreExtensions"; an empty
 *                      string if the extension is unknown or the external tool it depends
 *                      on is not configured; otherwise the translated label, passed
 *                      through sprintf() with the extension as argument
 */
public function searchTypeMediaTitle(string $extension): false|string
{
    // Read indexer-config
    $indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');
    // Ignore extensions
    $ignoredExtensions = GeneralUtility::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']), true);
    if (in_array($extension, $ignoredExtensions, true)) {
        return false;
    }

    // extension => [configuration key of the required external tool (null: none needed), label suffix]
    $titleMap = [
        'pdf' => ['pdftools', 'PDF'],
        'doc' => ['catdoc', 'DOC'],
        // MS PowerPoint
        'pps' => ['ppthtml', 'PP'],
        'ppt' => ['ppthtml', 'PP'],
        // MS Excel
        'xls' => ['xlhtml', 'XLS'],
        // Microsoft Word >= 2007
        'docx' => ['unzip', 'DOC'],
        'dotx' => ['unzip', 'DOC'],
        // Microsoft PowerPoint >= 2007
        'pptx' => ['unzip', 'PP'],
        'ppsx' => ['unzip', 'PP'],
        'potx' => ['unzip', 'PP'],
        // Microsoft Excel >= 2007
        'xlsx' => ['unzip', 'XLS'],
        'xltx' => ['unzip', 'XLS'],
        // Open Office Calc / Impress / Writer
        'sxc' => ['unzip', 'SXC'],
        'sxi' => ['unzip', 'SXI'],
        'sxw' => ['unzip', 'SXW'],
        // Oasis OpenDocument Spreadsheet / Presentation / Text
        'ods' => ['unzip', 'ODS'],
        'odp' => ['unzip', 'ODP'],
        'odt' => ['unzip', 'ODT'],
        'rtf' => ['unrtf', 'RTF'],
        // PHP EXIF
        'jpeg' => [null, 'images'],
        'jpg' => [null, 'images'],
        'tif' => [null, 'images'],
        // PHP strip-tags()
        'html' => [null, 'HTML'],
        'htm' => [null, 'HTML'],
        // Raw text
        'txt' => [null, 'TXT'],
        'csv' => [null, 'CSV'],
        // PHP strip-tags()
        'xml' => [null, 'XML'],
    ];

    if (!isset($titleMap[$extension])) {
        return '';
    }
    [$requiredTool, $labelSuffix] = $titleMap[$extension];
    if ($requiredTool !== null && !$indexerConfig[$requiredTool]) {
        // The external tool this format depends on is not configured
        return '';
    }

    return sprintf(
        $this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:extension.' . $labelSuffix),
        $extension
    );
}
399 
/**
 * Tells whether files with the given extension are handled as multi-page
 * documents. Only "pdf" (exact, lowercase) qualifies.
 *
 * @param string $extension File extension
 * @return bool TRUE only for "pdf"
 */
public function isMultiplePageExtension(string $extension): bool
{
    return strcmp($extension, 'pdf') === 0;
}
410 
/**
 * Resolves a label reference by delegating to the sL() method of the
 * language object chosen in the constructor.
 *
 * @param string $reference Label reference, e.g. "LLL:EXT:.../locallang_main.xlf:key"
 * @return string Whatever the language object returns for the reference
 */
protected function sL(string $reference): string
{
    $languageObject = $this->langObject;

    return $languageObject->sL($reference);
}
421 
422  /************************
423  *
424  * Reading documents (for parsing)
425  *
426  ************************/
/**
 * Reads the content of an external file and returns it as indexing data
 * (title, body, keywords, description).
 *
 * For every format except html/htm the LC_CTYPE locale is switched via
 * setLocaleForServerFileSystem() before any shell argument is escaped and is
 * restored afterwards, also when an exception is thrown in between.
 *
 * @param string $ext File extension, must have been enabled through initParser() before
 * @param string $absFile Absolute path of the file to read
 * @param string|int $cPKey Content part key; for PDF files a page range "<first>-<last>"
 * @return IndexingDataAsString|false|null Indexing data, or FALSE if the extension is not supported
 */
public function readFileContent(string $ext, string $absFile, string|int $cPKey): IndexingDataAsString|false|null
{
    // Return immediately if initialization didn't set support up:
    if (empty($this->supportedExtensions[$ext])) {
        return false;
    }
    $indexingDataDto = new IndexingDataAsString();

    // escapeshellarg() depends on LC_CTYPE, so the locale has to be switched
    // BEFORE any command is assembled. html/htm is read without touching the locale.
    $switchLocale = $ext !== 'html' && $ext !== 'htm';
    if ($switchLocale) {
        $this->setLocaleForServerFileSystem();
    }
    try {
        switch ($ext) {
            case 'pdf':
                if (empty($this->app['pdfinfo'])) {
                    break;
                }
                // Getting pdf-info:
                $pdfInfo = $this->splitPdfInfo(
                    $this->collectCommandOutput($this->app['pdfinfo'] . ' -enc UTF-8 ' . escapeshellarg($absFile))
                );
                if ((int)($pdfInfo['pages'] ?? 0)) {
                    // Page numbers are forced to integers before they reach the shell.
                    // A key without "-" is treated as a single page.
                    $pageRange = explode('-', (string)$cPKey, 2);
                    $low = (int)$pageRange[0];
                    $high = (int)($pageRange[1] ?? $pageRange[0]);
                    // Create a temporary name and delete the file if it exists, just to be safe.
                    $tempFileName = GeneralUtility::tempnam('Typo3_indexer');
                    @unlink($tempFileName);
                    // Get pdf content:
                    $cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q '
                        . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
                    CommandUtility::exec($cmd);
                    if (@is_file($tempFileName)) {
                        $content = (string)file_get_contents($tempFileName);
                        unlink($tempFileName);
                    } else {
                        $content = '';
                        $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:pdfToolsFailed'), $absFile), LogLevel::WARNING);
                    }
                    if ($content !== '') {
                        $indexingDataDto = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                    }
                }
                if (!empty($pdfInfo['title'])) {
                    $indexingDataDto->title = $pdfInfo['title'];
                }
                break;
            case 'doc':
                if (empty($this->app['catdoc'])) {
                    break;
                }
                $content = implode(LF, $this->collectCommandOutput($this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile)));
                $indexingDataDto = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                break;
            case 'pps':
            case 'ppt':
                if (empty($this->app['ppthtml'])) {
                    break;
                }
                $content = implode(LF, $this->collectCommandOutput($this->app['ppthtml'] . ' ' . escapeshellarg($absFile)));
                $content = $this->pObj->convertHTMLToUtf8($content);
                $indexingDataDto = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                $indexingDataDto->title = PathUtility::basename($absFile);
                break;
            case 'xls':
                if (empty($this->app['xlhtml'])) {
                    break;
                }
                $content = implode(LF, $this->collectCommandOutput($this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile)));
                $content = $this->pObj->convertHTMLToUtf8($content);
                $indexingDataDto = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                $indexingDataDto->title = PathUtility::basename($absFile);
                break;
            case 'docx':
            case 'dotx':
            case 'pptx':
            case 'ppsx':
            case 'potx':
            case 'xlsx':
            case 'xltx':
                if (empty($this->app['unzip'])) {
                    break;
                }
                // Archive member holding the main content of the respective format
                $archiveMember = match ($ext) {
                    'docx', 'dotx' => 'word/document.xml',
                    'pptx', 'ppsx', 'potx' => 'ppt/slides/slide1.xml',
                    'xlsx', 'xltx' => 'xl/worksheets/sheet1.xml',
                };
                $unzipCommand = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ';
                $content_xml = implode(LF, $this->collectCommandOutput($unzipCommand . $archiveMember));
                // A space is put in front of every tag so words of adjacent elements do not get glued together
                $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
                $indexingDataDto = $this->pObj->splitRegularContent($utf8_content);
                // Make sure the title doesn't expose the absolute path!
                $indexingDataDto->title = PathUtility::basename($absFile);
                // Meta information
                $meta_xml = implode(LF, $this->collectCommandOutput($unzipCommand . 'docProps/core.xml'));
                $metaContent = GeneralUtility::xml2tree($meta_xml);
                if (is_array($metaContent)) {
                    $coreProperties = $metaContent['cp:coreProperties'][0]['ch'] ?? [];
                    $indexingDataDto->title .= ' ' . ($coreProperties['dc:title'][0]['values'][0] ?? '');
                    $indexingDataDto->description = ($coreProperties['dc:subject'][0]['values'][0] ?? '');
                    $indexingDataDto->description .= ' ' . ($coreProperties['dc:description'][0]['values'][0] ?? '');
                    $indexingDataDto->keywords = ($coreProperties['cp:keywords'][0]['values'][0] ?? '');
                }
                break;
            case 'sxi':
            case 'sxc':
            case 'sxw':
            case 'ods':
            case 'odp':
            case 'odt':
                if (empty($this->app['unzip'])) {
                    break;
                }
                $unzipCommand = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ';
                // Read content.xml:
                $content_xml = implode(LF, $this->collectCommandOutput($unzipCommand . 'content.xml'));
                // Read meta.xml:
                $meta_xml = implode(LF, $this->collectCommandOutput($unzipCommand . 'meta.xml'));
                $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
                $indexingDataDto = $this->pObj->splitRegularContent($utf8_content);
                // Make sure the title doesn't expose the absolute path!
                $indexingDataDto->title = PathUtility::basename($absFile);
                // Meta information; the result is only used when it is an array with the expected structure
                $metaTree = GeneralUtility::xml2tree($meta_xml);
                $metaContent = is_array($metaTree)
                    ? ($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'] ?? null)
                    : null;
                if (is_array($metaContent)) {
                    $indexingDataDto->title = ($metaContent['dc:title'][0]['values'][0] ?? '') ?: $indexingDataDto->title;
                    $indexingDataDto->description = ($metaContent['dc:subject'][0]['values'][0] ?? '')
                        . ' ' . ($metaContent['dc:description'][0]['values'][0] ?? '');
                    // Keywords collected:
                    $keywordNodes = $metaContent['meta:keywords'][0]['ch']['meta:keyword'] ?? null;
                    if (is_array($keywordNodes)) {
                        foreach ($keywordNodes as $kwDat) {
                            $indexingDataDto->keywords .= ($kwDat['values'][0] ?? '') . ' ';
                        }
                    }
                }
                break;
            case 'rtf':
                if (empty($this->app['unrtf'])) {
                    break;
                }
                $fileContent = implode(LF, $this->collectCommandOutput($this->app['unrtf'] . ' ' . escapeshellarg($absFile)));
                $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
                $indexingDataDto = $this->pObj->splitHTMLContent($fileContent);
                break;
            case 'txt':
            case 'csv':
                // Raw text; getUrl() returns FALSE on failure, which is treated as empty content
                $content = (string)GeneralUtility::getUrl($absFile);
                // @todo Implement auto detection of charset (currently assuming utf-8)
                $contentCharset = 'utf-8';
                $content = $this->pObj->convertHTMLToUtf8($content, $contentCharset);
                $indexingDataDto = $this->pObj->splitRegularContent($content);
                // Make sure the title doesn't expose the absolute path!
                $indexingDataDto->title = PathUtility::basename($absFile);
                break;
            case 'html':
            case 'htm':
                $fileContent = (string)GeneralUtility::getUrl($absFile);
                $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
                $indexingDataDto = $this->pObj->splitHTMLContent($fileContent);
                break;
            case 'xml':
                // PHP strip-tags()
                $fileContent = (string)GeneralUtility::getUrl($absFile);
                // Finding charset in the XML declaration; utf-8 is assumed when none is declared
                $reg = [];
                preg_match('/^[[:space:]]*<\\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
                $charset = !empty($reg[1]) ? trim(strtolower($reg[1])) : 'utf-8';
                // Converting content:
                $fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
                $indexingDataDto = $this->pObj->splitRegularContent($fileContent);
                // Make sure the title doesn't expose the absolute path!
                $indexingDataDto->title = PathUtility::basename($absFile);
                break;
            case 'jpg':
            case 'jpeg':
            case 'tif':
                // PHP EXIF
                $exif = function_exists('exif_read_data') ? @exif_read_data($absFile, 'IFD0') : false;
                $comment = $exif
                    ? trim(($exif['COMMENT'][0] ?? '') . ' ' . ($exif['ImageDescription'] ?? ''))
                    : '';
                $indexingDataDto = $this->pObj->splitRegularContent($comment);
                // Make sure the title doesn't expose the absolute path!
                $indexingDataDto->title = PathUtility::basename($absFile);
                break;
            default:
                return false;
        }
    } finally {
        if ($switchLocale) {
            $this->setLocaleForServerFileSystem(true);
        }
    }
    // If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
    if (!$indexingDataDto->title) {
        // Substituting "_" for " " because many filenames may have this instead of a space char.
        $indexingDataDto->title = str_replace('_', ' ', PathUtility::basename($absFile));
    }
    return $indexingDataDto;
}

/**
 * Executes a shell command and returns the lines it wrote to standard output.
 *
 * @param string $cmd Complete command line, arguments already escaped
 * @return list<string> Output lines; empty if the command printed nothing
 */
private function collectCommandOutput(string $cmd): array
{
    // Start with an empty array so that output of earlier commands can never leak in.
    $output = [];
    CommandUtility::exec($cmd, $output);
    return $output ?? [];
}
681 
/**
 * Switches LC_CTYPE to the configured system locale, or restores the locale
 * that was active before the switch.
 *
 * Does nothing unless $GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem'] is set.
 * Calls have to come in pairs: first without argument (switch), then with TRUE (restore).
 *
 * @param bool $resetLocale TRUE to restore the previously active locale
 * @throws \RuntimeException if a restore is requested without a remembered locale (1357064326),
 *                           or a switch is requested while a locale is still remembered (1357064437)
 */
protected function setLocaleForServerFileSystem(bool $resetLocale = false): void
{
    $utf8FileSystem = $GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem'];
    if (!$utf8FileSystem) {
        return;
    }

    if (!$resetLocale) {
        if ($this->lastLocale !== null) {
            throw new \RuntimeException('Cannot set new locale as locale has already been changed before.', 1357064437);
        }
        // setlocale() with '0' only queries the current setting
        $currentLocale = setlocale(LC_CTYPE, '0');
        $this->lastLocale = $currentLocale ?: null;
        $systemLocale = $GLOBALS['TYPO3_CONF_VARS']['SYS']['systemLocale'] ?? false;
        if ($systemLocale) {
            setlocale(LC_CTYPE, $GLOBALS['TYPO3_CONF_VARS']['SYS']['systemLocale']);
        }
        return;
    }

    if ($this->lastLocale === null) {
        throw new \RuntimeException('Cannot reset locale to NULL.', 1357064326);
    }
    setlocale(LC_CTYPE, $this->lastLocale);
    $this->lastLocale = null;
}
714 
/**
 * Splits a file into the content parts that are indexed separately.
 *
 * Only PDF files are split, into page ranges "<first>-<last>". Every other
 * extension, a PDF without a usable page count, and a missing pdfinfo binary
 * yield the single part [0].
 *
 * $this->pdf_mode controls the split: a positive value is the number of pages
 * per part, otherwise its absolute value is the number of parts (limited to
 * the number of pages, at least 1).
 *
 * @param string $ext File extension
 * @param string $absFile Absolute path of the file
 * @return array List of content part keys
 */
public function fileContentParts(string $ext, string $absFile): array
{
    $cParts = [0];
    if ($ext !== 'pdf' || empty($this->app['pdfinfo'])) {
        return $cParts;
    }

    // escapeshellarg() depends on LC_CTYPE, so switch the locale first and
    // always restore it, even if running pdfinfo throws.
    $this->setLocaleForServerFileSystem();
    try {
        // Getting pdf-info:
        $res = [];
        CommandUtility::exec($this->app['pdfinfo'] . ' ' . escapeshellarg($absFile), $res);
        $pdfInfo = $this->splitPdfInfo($res ?? []);
    } finally {
        $this->setLocaleForServerFileSystem(true);
    }

    // pdfinfo reports the page count as text; it is absent when pdfinfo failed
    $pages = (int)($pdfInfo['pages'] ?? 0);
    if ($pages < 1) {
        return $cParts;
    }

    // Calculate the number of parts
    if ($this->pdf_mode > 0) {
        $iter = (int)ceil($pages / $this->pdf_mode);
    } else {
        $iter = MathUtility::forceIntegerInRange(abs($this->pdf_mode), 1, $pages);
    }

    // Traverse and create intervals.
    $cParts = [];
    $pagesPerPart = $pages / $iter;
    for ($a = 0; $a < $iter; $a++) {
        $low = (int)floor($a * $pagesPerPart) + 1;
        $high = (int)floor(($a + 1) * $pagesPerPart);
        $cParts[] = $low . '-' . $high;
    }
    return $cParts;
}
757 
/**
 * Turns "Key: value" lines (the output of pdfinfo) into an associative array.
 *
 * Each line is split at its first colon. Keys are trimmed and lowercased,
 * values are trimmed. Lines without a colon or with an empty key are skipped.
 *
 * @param array $pdfInfoArray Output lines of pdfinfo
 * @return array Map of lowercased key => value
 */
public function splitPdfInfo(array $pdfInfoArray): array
{
    $info = [];
    foreach ($pdfInfoArray as $row) {
        $segments = explode(':', $row, 2);
        if (count($segments) <= 1) {
            continue;
        }
        [$rawKey, $rawValue] = $segments;
        $key = trim($rawKey);
        if (!$key) {
            continue;
        }
        $info[strtolower($key)] = trim($rawValue);
    }

    return $info;
}
777 
/**
 * Strips trailing line feeds and form feeds (character 12) from a string
 * and trims the result.
 *
 * @param string $string Raw text, typically the output of a converter tool
 * @return string Cleaned text
 */
public function removeEndJunk(string $string): string
{
    $trailingJunkPattern = '/[' . LF . chr(12) . ']*$/';
    $withoutJunk = preg_replace($trailingJunkPattern, '', $string);

    return trim((string)$withoutJunk);
}
788 
789  /************************
790  *
791  * Backend analyzer
792  *
793  ************************/
/**
 * Returns the "EXT:" path of the file type icon for an extension.
 * "htm" shares the icon of "html" and "jpeg" the one of "jpg".
 *
 * @param string $extension File extension
 * @return string Icon path below EXT:indexed_search/Resources/Public/Icons/FileTypes/
 */
public function getIcon(string $extension): string
{
    $iconName = match ($extension) {
        'htm' => 'html',
        'jpeg' => 'jpg',
        default => $extension,
    };

    return 'EXT:indexed_search/Resources/Public/Icons/FileTypes/' . $iconName . '.gif';
}
809 }
‪TYPO3\CMS\IndexedSearch\FileContentParser\$app
‪array $app
Definition: FileContentParser.php:43
‪TYPO3\CMS\IndexedSearch
‪TYPO3\CMS\Core\Utility\PathUtility
Definition: PathUtility.php:27
‪TYPO3\CMS\IndexedSearch\Dto\IndexingDataAsString
Definition: IndexingDataAsString.php:24
‪TYPO3\CMS\IndexedSearch\FileContentParser\removeEndJunk
‪string removeEndJunk(string $string)
Definition: FileContentParser.php:784
‪TYPO3\CMS\IndexedSearch\FileContentParser\$langObject
‪LanguageService TypoScriptFrontendController $langObject
Definition: FileContentParser.php:47
‪TYPO3\CMS\Core\Configuration\ExtensionConfiguration
Definition: ExtensionConfiguration.php:47
‪TYPO3\CMS\IndexedSearch\FileContentParser\$pdf_mode
‪int $pdf_mode
Definition: FileContentParser.php:42
‪TYPO3\CMS\IndexedSearch\FileContentParser\searchTypeMediaTitle
‪string false searchTypeMediaTitle(string $extension)
Definition: FileContentParser.php:276
‪TYPO3\CMS\IndexedSearch\FileContentParser\softInit
‪bool softInit(string $extension)
Definition: FileContentParser.php:234
‪TYPO3\CMS\Core\Utility\PathUtility\basename
‪static basename(string $path)
Definition: PathUtility.php:219
‪TYPO3\CMS\Core\Utility\GeneralUtility\getUrl
‪static string false getUrl(string $url)
Definition: GeneralUtility.php:1444
‪TYPO3\CMS\IndexedSearch\FileContentParser\$supportedExtensions
‪array $supportedExtensions
Definition: FileContentParser.php:45
‪TYPO3\CMS\IndexedSearch\FileContentParser\sL
‪string sL(string $reference)
Definition: FileContentParser.php:417
‪TYPO3\CMS\Core\Utility\CommandUtility\exec
‪static exec(string $command, ?array &$output=null, int &$returnValue=0)
Definition: CommandUtility.php:85
‪TYPO3\CMS\IndexedSearch\FileContentParser\setLocaleForServerFileSystem
‪setLocaleForServerFileSystem(bool $resetLocale=false)
Definition: FileContentParser.php:692
‪TYPO3\CMS\IndexedSearch\FileContentParser\getIcon
‪string getIcon(string $extension)
Definition: FileContentParser.php:800
‪TYPO3\CMS\IndexedSearch\FileContentParser
Definition: FileContentParser.php:36
‪TYPO3\CMS\IndexedSearch\FileContentParser\$pObj
‪Indexer $pObj
Definition: FileContentParser.php:46
‪TYPO3\CMS\IndexedSearch\FileContentParser\__construct
‪__construct()
Definition: FileContentParser.php:53
‪TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController
Definition: TypoScriptFrontendController.php:58
‪$GLOBALS
‪$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['adminpanel']['modules']
Definition: ext_localconf.php:25
‪TYPO3\CMS\IndexedSearch\FileContentParser\readFileContent
‪IndexingDataAsString false null readFileContent(string $ext, string $absFile, string|int $cPKey)
Definition: FileContentParser.php:435
‪TYPO3\CMS\Core\Core\Environment
Definition: Environment.php:41
‪TYPO3\CMS\IndexedSearch\FileContentParser\$ext2itemtype_map
‪array $ext2itemtype_map
Definition: FileContentParser.php:44
‪TYPO3\CMS\Core\Utility\MathUtility
Definition: MathUtility.php:24
‪TYPO3\CMS\IndexedSearch\FileContentParser\fileContentParts
‪array fileContentParts(string $ext, string $absFile)
Definition: FileContentParser.php:725
‪TYPO3\CMS\Core\Http\fromRequest
‪@ fromRequest
Definition: ApplicationType.php:66
‪TYPO3\CMS\Core\Localization\LanguageService
Definition: LanguageService.php:46
‪TYPO3\CMS\Core\Utility\MathUtility\forceIntegerInRange
‪static int forceIntegerInRange(mixed $theInt, int $min, int $max=2000000000, int $defaultValue=0)
Definition: MathUtility.php:34
‪TYPO3\CMS\Core\Utility\GeneralUtility
Definition: GeneralUtility.php:52
‪TYPO3\CMS\IndexedSearch\FileContentParser\splitPdfInfo
‪array splitPdfInfo(array $pdfInfoArray)
Definition: FileContentParser.php:766
‪TYPO3\CMS\Core\Utility\CommandUtility
Definition: CommandUtility.php:54
‪TYPO3\CMS\IndexedSearch\FileContentParser\$lastLocale
‪string $lastLocale
Definition: FileContentParser.php:48
‪TYPO3\CMS\Core\Utility\GeneralUtility\trimExplode
‪static list< string > trimExplode(string $delim, string $string, bool $removeEmptyValues=false, int $limit=0)
Definition: GeneralUtility.php:822
‪TYPO3\CMS\Core\Http\ApplicationType
‪ApplicationType
Definition: ApplicationType.php:55
‪TYPO3\CMS\IndexedSearch\FileContentParser\initParser
‪bool initParser(string $extension)
Definition: FileContentParser.php:65
‪TYPO3\CMS\IndexedSearch\FileContentParser\isMultiplePageExtension
‪bool isMultiplePageExtension(string $extension)
Definition: FileContentParser.php:406
‪TYPO3\CMS\Core\Core\Environment\isWindows
‪static isWindows()
Definition: Environment.php:276