‪TYPO3CMS  11.5
FileContentParser.php
Go to the documentation of this file.
1 <?php
2 
3 /*
4  * This file is part of the TYPO3 CMS project.
5  *
6  * It is free software; you can redistribute it and/or modify it under
7  * the terms of the GNU General Public License, either version 2
8  * of the License, or any later version.
9  *
10  * For the full copyright and license information, please read the
11  * LICENSE.txt file that was distributed with this source code.
12  *
13  * The TYPO3 project - inspiring people to share!
14  */
15 
17 
18 use Psr\Log\LogLevel;
26 
33 {
/**
 * Mode for splitting PDF files into indexable sections. initParser()
 * overwrites it with the "pdf_mode" extension setting clamped to -100..100.
 * fileContentParts() treats a positive value as "pages per section" and
 * otherwise uses the absolute value as the number of sections.
 *
 * @var int
 */
public $pdf_mode = -20;

/**
 * Paths of the external tools found by initParser(), keyed by tool name
 * (e.g. "pdfinfo", "pdftotext", "catdoc", "unzip").
 *
 * @var array
 */
public $app = [];

/**
 * Maps every initialised file extension to its common item type
 * (e.g. "htm" => "html", "jpg" => "jpeg").
 *
 * @var array
 */
public $ext2itemtype_map = [];

/**
 * File extensions that initParser() accepted, as extension => true.
 * readFileContent() refuses every extension not listed here.
 *
 * @var array
 */
public $supportedExtensions = [];

/**
 * Parent indexer object; used for logging and for splitting/converting content.
 *
 * @var \TYPO3\CMS\IndexedSearch\Indexer
 */
public $pObj;

/**
 * Object used to translate "LLL:" label references, chosen in the constructor.
 *
 * @var \TYPO3\CMS\Core\Localization\LanguageService|\TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController
 */
protected $langObject;

/**
 * LC_CTYPE locale that was active before setLocaleForServerFileSystem()
 * switched it; NULL while no switch is in effect.
 *
 * @var string|null
 */
protected $lastLocale;
72 
/**
 * Chooses the object used for translating labels: $GLOBALS['TSFE'] when the
 * current request is a frontend request, $GLOBALS['LANG'] otherwise.
 */
public function __construct()
{
    // ApplicationType::fromRequest() only accepts a PSR-7 server request. When
    // none is registered (for example on the command line) there is no frontend
    // context, so the backend language object is used instead of failing.
    $request = $GLOBALS['TYPO3_REQUEST'] ?? null;
    $isFrontend = $request instanceof \Psr\Http\Message\ServerRequestInterface
        && ApplicationType::fromRequest($request)->isFrontend();
    $this->langObject = $isFrontend ? $GLOBALS['TSFE'] : $GLOBALS['LANG'];
}
81 
/**
 * Initializes the parser for one file extension.
 *
 * Reads the indexed_search extension configuration, rejects extensions listed
 * in "ignoreExtensions", locates the external tool(s) the extension needs and,
 * on success, registers the extension in $this->supportedExtensions and
 * $this->ext2itemtype_map.
 *
 * @param string $extension File extension to initialize
 * @return bool TRUE if the extension is supported with the current configuration, otherwise FALSE
 */
public function initParser($extension)
{
    // Read indexer-config:
    $indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');
    // If windows, apply extension to tool name:
    $exe = Environment::isWindows() ? '.exe' : '';
    // Ignore extensions (a missing setting is treated as an empty list)
    $ignoreExtensions = GeneralUtility::trimExplode(',', strtolower((string)($indexerConfig['ignoreExtensions'] ?? '')), true);
    if (in_array((string)$extension, $ignoreExtensions, true)) {
        $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:ignoreExtensions'), $extension), LogLevel::WARNING);
        return false;
    }
    $extOK = false;
    $mainExtension = '';
    // Switch on file extension:
    switch ($extension) {
        case 'pdf':
            // PDF: needs both pdfinfo and pdftotext
            if ($this->registerExternalTools($indexerConfig, 'pdftools', ['pdfinfo', 'pdftotext'], 'pdfTools', $exe)) {
                // PDF mode:
                $this->pdf_mode = MathUtility::forceIntegerInRange($indexerConfig['pdf_mode'] ?? 0, -100, 100);
                $extOK = true;
            }
            break;
        case 'doc':
            // MS Word via catdoc
            $extOK = $this->registerExternalTools($indexerConfig, 'catdoc', ['catdoc'], 'catdoc', $exe);
            break;
        case 'pps':
        case 'ppt':
            // MS PowerPoint via ppthtml
            $extOK = $this->registerExternalTools($indexerConfig, 'ppthtml', ['ppthtml'], 'ppthtml', $exe);
            break;
        case 'xls':
            // MS Excel via xlhtml
            $extOK = $this->registerExternalTools($indexerConfig, 'xlhtml', ['xlhtml'], 'xlhtml', $exe);
            break;
        case 'docx': // Microsoft Word >= 2007
        case 'dotx':
        case 'pptx': // Microsoft PowerPoint >= 2007
        case 'ppsx':
        case 'potx':
        case 'xlsx': // Microsoft Excel >= 2007
        case 'xltx':
        case 'sxc': // OpenOffice / Oasis OpenDocument
        case 'sxi':
        case 'sxw':
        case 'ods':
        case 'odp':
        case 'odt':
            // All of these are zip containers read via unzip
            $extOK = $this->registerExternalTools($indexerConfig, 'unzip', ['unzip'], 'unzip', $exe);
            break;
        case 'rtf':
            // RTF via unrtf
            $extOK = $this->registerExternalTools($indexerConfig, 'unrtf', ['unrtf'], 'unrtf', $exe);
            break;
        case 'txt':
        case 'csv':
        case 'xml':
        case 'tif':
            // Handled with PHP functions only, no external tool required
            $extOK = true;
            break;
        case 'html':
        case 'htm':
            // PHP strip-tags()
            $extOK = true;
            // making "html" the common "item_type"
            $mainExtension = 'html';
            break;
        case 'jpg':
        case 'jpeg':
            // PHP EXIF
            $extOK = true;
            // making "jpeg" the common item_type
            $mainExtension = 'jpeg';
            break;
    }
    // If extension was OK:
    if ($extOK) {
        $this->supportedExtensions[$extension] = true;
        $this->ext2itemtype_map[$extension] = $mainExtension ?: $extension;
        return true;
    }
    return false;
}

/**
 * Locates the binaries of one external tool and registers them in $this->app.
 *
 * Logs a notice with label "<labelPrefix>Disabled" when no directory is
 * configured and an error with label "<labelPrefix>NotFound" when one of the
 * binaries does not exist. Nothing is registered unless all binaries exist.
 *
 * @param array $indexerConfig Extension configuration of indexed_search
 * @param string $configKey Configuration key holding the tool directory
 * @param string[] $binaries Base names of the binaries that must exist in that directory
 * @param string $labelPrefix Prefix of the locallang_main.xlf labels used for logging
 * @param string $exe Suffix appended to every binary name (".exe" on Windows)
 * @return bool TRUE if all binaries were found and registered
 */
private function registerExternalTools(array $indexerConfig, string $configKey, array $binaries, string $labelPrefix, string $exe): bool
{
    $labelBase = 'LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:' . $labelPrefix;
    if (empty($indexerConfig[$configKey])) {
        $this->pObj->log_setTSlogMessage($this->sL($labelBase . 'Disabled'), LogLevel::NOTICE);
        return false;
    }
    $toolPath = rtrim((string)$indexerConfig[$configKey], '/') . '/';
    foreach ($binaries as $binary) {
        if (!@is_file($toolPath . $binary . $exe)) {
            $this->pObj->log_setTSlogMessage(sprintf($this->sL($labelBase . 'NotFound'), $toolPath), LogLevel::ERROR);
            return false;
        }
    }
    foreach ($binaries as $binary) {
        $this->app[$binary] = $toolPath . $binary . $exe;
    }
    return true;
}
249 
/**
 * Tells whether a file extension belongs to the set of extensions this parser
 * knows about. No configuration is read and no external tool is checked.
 *
 * @param string $extension File extension
 * @return bool TRUE if the extension is one of the known extensions
 */
public function softInit($extension)
{
    $knownExtensions = [
        'pdf',
        'doc', 'docx', 'dotx',
        'pps', 'ppsx', 'ppt', 'pptx', 'potx',
        'xls', 'xlsx', 'xltx',
        'sxc', 'sxi', 'sxw',
        'ods', 'odp', 'odt',
        'rtf', 'txt',
        'html', 'htm',
        'csv', 'xml',
        'jpg', 'jpeg',
        // TIF images (EXIF comment)
        'tif',
    ];
    // Deliberately non-strict: this keeps the comparison semantics of a switch statement.
    return in_array($extension, $knownExtensions);
}
292 
/**
 * Returns the translated media-type title for a file extension.
 *
 * @param string $extension File extension
 * @return string|false The title; an empty string if the extension is unknown
 *                      or the external tool it needs is not configured; FALSE
 *                      if the extension is listed in "ignoreExtensions"
 */
public function searchTypeMediaTitle($extension)
{
    // Read indexer-config
    $indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');
    // Ignore extensions (a missing setting is treated as an empty list)
    $ignoreExtensions = GeneralUtility::trimExplode(',', strtolower((string)($indexerConfig['ignoreExtensions'] ?? '')), true);
    if (in_array((string)$extension, $ignoreExtensions, true)) {
        return false;
    }
    // extension => [configuration key of the required tool (NULL if none), label suffix]
    $titleMap = [
        // PDF
        'pdf' => ['pdftools', 'PDF'],
        // MS Word via catdoc
        'doc' => ['catdoc', 'DOC'],
        // MS PowerPoint via ppthtml
        'pps' => ['ppthtml', 'PP'],
        'ppt' => ['ppthtml', 'PP'],
        // MS Excel via xlhtml
        'xls' => ['xlhtml', 'XLS'],
        // Microsoft Word >= 2007
        'docx' => ['unzip', 'DOC'],
        'dotx' => ['unzip', 'DOC'],
        // Microsoft PowerPoint >= 2007
        'pptx' => ['unzip', 'PP'],
        'ppsx' => ['unzip', 'PP'],
        'potx' => ['unzip', 'PP'],
        // Microsoft Excel >= 2007
        'xlsx' => ['unzip', 'XLS'],
        'xltx' => ['unzip', 'XLS'],
        // Open Office Calc / Impress / Writer
        'sxc' => ['unzip', 'SXC'],
        'sxi' => ['unzip', 'SXI'],
        'sxw' => ['unzip', 'SXW'],
        // Oasis OpenDocument Spreadsheet / Presentation / Text
        'ods' => ['unzip', 'ODS'],
        'odp' => ['unzip', 'ODP'],
        'odt' => ['unzip', 'ODT'],
        // RTF via unrtf
        'rtf' => ['unrtf', 'RTF'],
        // PHP EXIF
        'jpeg' => [null, 'images'],
        'jpg' => [null, 'images'],
        'tif' => [null, 'images'],
        // PHP strip-tags()
        'html' => [null, 'HTML'],
        'htm' => [null, 'HTML'],
        // Raw text
        'txt' => [null, 'TXT'],
        'csv' => [null, 'CSV'],
        // PHP strip-tags()
        'xml' => [null, 'XML'],
    ];
    $lookupKey = (string)$extension;
    if (!isset($titleMap[$lookupKey])) {
        return '';
    }
    [$requiredTool, $labelSuffix] = $titleMap[$lookupKey];
    // Extensions that depend on an external tool only get a title when that tool is configured
    if ($requiredTool !== null && empty($indexerConfig[$requiredTool])) {
        return '';
    }
    return sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:extension.' . $labelSuffix), $extension);
}
422 
/**
 * Tells whether files with the given extension are split into several
 * sections ("pages") for indexing. Only "pdf" qualifies.
 *
 * @param string $extension File extension
 * @return bool TRUE for "pdf", FALSE for everything else
 */
public function isMultiplePageExtension($extension)
{
    return (string)$extension === 'pdf';
}
438 
/**
 * Resolves a label reference by delegating to the language object that was
 * selected in the constructor.
 *
 * @param string $reference Label reference, e.g. "LLL:EXT:.../locallang_main.xlf:key"
 * @return string Whatever the language object's sL() returns for the reference
 */
protected function sL($reference)
{
    $translator = $this->langObject;
    return $translator->sL($reference);
}
449 
450  /************************
451  *
452  * Reading documents (for parsing)
453  *
454  ************************/
/**
 * Reads the content of an external file being indexed.
 *
 * Depending on the extension the text is extracted with an external tool
 * registered by initParser() or with plain PHP functions, and then handed to
 * the parent indexer for splitting into a content array.
 *
 * @param string $ext File extension, e.g. "pdf", "doc"
 * @param string $absFile Absolute filename of the file
 * @param string $cPKey Section pointer; for PDF files "<first page>-<last page>", ignored for all other types
 * @return array|false|null Content array from the parent indexer with "title" (and, where available,
 *                          "description" and "keywords") filled in; FALSE if the extension is not
 *                          supported; NULL if nothing could be extracted
 */
public function readFileContent($ext, $absFile, $cPKey)
{
    $contentArr = null;
    // Return immediately if initialization didn't set support up:
    if (empty($this->supportedExtensions[$ext])) {
        return false;
    }
    // Switch by file extension.
    // Branches that touch the file system switch the locale first and restore it
    // afterwards; setLocaleForServerFileSystem(true) throws if it was not set before.
    switch ($ext) {
        case 'pdf':
            if (!empty($this->app['pdfinfo'])) {
                $this->setLocaleForServerFileSystem();
                // Getting pdf-info:
                $cmd = $this->app['pdfinfo'] . ' -enc UTF-8 ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $pdfInfo = $this->splitPdfInfo($res);
                unset($res);
                if ((int)($pdfInfo['pages'] ?? 0)) {
                    // Cast the page range to integers so nothing but numbers reaches the shell
                    $pageRange = explode('-', (string)$cPKey, 2);
                    $low = (int)$pageRange[0];
                    $high = (int)($pageRange[1] ?? 0);
                    // Create temporary name
                    $tempFileName = GeneralUtility::tempnam('Typo3_indexer');
                    // Delete if exists, just to be safe.
                    @unlink($tempFileName);
                    // Get pdf content:
                    $cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
                    CommandUtility::exec($cmd);
                    if (@is_file($tempFileName)) {
                        $content = (string)file_get_contents($tempFileName);
                        unlink($tempFileName);
                    } else {
                        $content = '';
                        $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:pdfToolsFailed'), $absFile), LogLevel::WARNING);
                    }
                    if ($content !== '') {
                        $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                    }
                }
                if (!empty($pdfInfo['title'])) {
                    $contentArr['title'] = $pdfInfo['title'];
                }
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'doc':
            if (!empty($this->app['catdoc'])) {
                $this->setLocaleForServerFileSystem();
                $cmd = $this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $content = implode(LF, $res);
                unset($res);
                $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'pps':
        case 'ppt':
            if (!empty($this->app['ppthtml'])) {
                $this->setLocaleForServerFileSystem();
                $cmd = $this->app['ppthtml'] . ' ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $content = implode(LF, $res);
                unset($res);
                $content = $this->pObj->convertHTMLToUtf8($content);
                $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = PathUtility::basename($absFile);
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'xls':
            if (!empty($this->app['xlhtml'])) {
                $this->setLocaleForServerFileSystem();
                $cmd = $this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $content = implode(LF, $res);
                unset($res);
                $content = $this->pObj->convertHTMLToUtf8($content);
                $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = PathUtility::basename($absFile);
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'docx':
        case 'dotx':
        case 'pptx':
        case 'ppsx':
        case 'potx':
        case 'xlsx':
        case 'xltx':
            if (!empty($this->app['unzip'])) {
                $this->setLocaleForServerFileSystem();
                switch ($ext) {
                    case 'docx':
                    case 'dotx':
                        // Read document.xml:
                        $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' word/document.xml';
                        break;
                    case 'ppsx':
                    case 'pptx':
                    case 'potx':
                        // Read slide1.xml:
                        $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ppt/slides/slide1.xml';
                        break;
                    case 'xlsx':
                    case 'xltx':
                        // Read sheet1.xml:
                        $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' xl/worksheets/sheet1.xml';
                        break;
                    default:
                        $cmd = '';
                        break;
                }
                CommandUtility::exec($cmd, $res);
                $content_xml = implode(LF, $res);
                unset($res);
                // Insert a space before every tag so words of adjacent elements do not run together
                $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
                $contentArr = $this->pObj->splitRegularContent($utf8_content);
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = PathUtility::basename($absFile);
                // Meta information
                $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' docProps/core.xml';
                CommandUtility::exec($cmd, $res);
                $meta_xml = implode(LF, $res);
                unset($res);
                $metaContent = GeneralUtility::xml2tree($meta_xml);
                if (is_array($metaContent)) {
                    $coreProperties = $metaContent['cp:coreProperties'][0]['ch'] ?? [];
                    $contentArr['title'] .= ' ' . ($coreProperties['dc:title'][0]['values'][0] ?? '');
                    $contentArr['description'] = ($coreProperties['dc:subject'][0]['values'][0] ?? '');
                    $contentArr['description'] .= ' ' . ($coreProperties['dc:description'][0]['values'][0] ?? '');
                    $contentArr['keywords'] = ($coreProperties['cp:keywords'][0]['values'][0] ?? '');
                }
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'sxi':
        case 'sxc':
        case 'sxw':
        case 'ods':
        case 'odp':
        case 'odt':
            if (!empty($this->app['unzip'])) {
                $this->setLocaleForServerFileSystem();
                // Read content.xml:
                $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' content.xml';
                CommandUtility::exec($cmd, $res);
                $content_xml = implode(LF, $res);
                unset($res);
                // Read meta.xml:
                $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' meta.xml';
                CommandUtility::exec($cmd, $res);
                $meta_xml = implode(LF, $res);
                unset($res);
                $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
                $contentArr = $this->pObj->splitRegularContent($utf8_content);
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = PathUtility::basename($absFile);
                // Meta information; xml2tree() does not return an array when parsing fails
                $metaTree = GeneralUtility::xml2tree($meta_xml);
                $metaContent = is_array($metaTree)
                    ? ($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'] ?? null)
                    : null;
                if (is_array($metaContent)) {
                    $contentArr['title'] = ($metaContent['dc:title'][0]['values'][0] ?? '') ?: $contentArr['title'];
                    $contentArr['description'] = ($metaContent['dc:subject'][0]['values'][0] ?? '') . ' ' . ($metaContent['dc:description'][0]['values'][0] ?? '');
                    // Keywords collected:
                    $keywordNodes = $metaContent['meta:keywords'][0]['ch']['meta:keyword'] ?? null;
                    if (is_array($keywordNodes)) {
                        $contentArr['keywords'] = $contentArr['keywords'] ?? '';
                        foreach ($keywordNodes as $kwDat) {
                            $contentArr['keywords'] .= ($kwDat['values'][0] ?? '') . ' ';
                        }
                    }
                }
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'rtf':
            if (!empty($this->app['unrtf'])) {
                $this->setLocaleForServerFileSystem();
                $cmd = $this->app['unrtf'] . ' ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $fileContent = implode(LF, $res);
                unset($res);
                $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
                $contentArr = $this->pObj->splitHTMLContent($fileContent);
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'txt':
        case 'csv':
            $this->setLocaleForServerFileSystem();
            // Raw text
            $content = GeneralUtility::getUrl($absFile);
            // @todo Implement auto detection of charset (currently assuming utf-8)
            $contentCharset = 'utf-8';
            $content = $this->pObj->convertHTMLToUtf8($content, $contentCharset);
            $contentArr = $this->pObj->splitRegularContent($content);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = PathUtility::basename($absFile);
            $this->setLocaleForServerFileSystem(true);
            break;
        case 'html':
        case 'htm':
            $fileContent = GeneralUtility::getUrl($absFile);
            $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
            $contentArr = $this->pObj->splitHTMLContent($fileContent);
            break;
        case 'xml':
            $this->setLocaleForServerFileSystem();
            // PHP strip-tags(); getUrl() may return FALSE, which is treated as empty content
            $fileContent = (string)GeneralUtility::getUrl($absFile);
            // Finding charset in the XML declaration, falling back to utf-8:
            $reg = [];
            preg_match('/^[[:space:]]*<\\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
            $charset = !empty($reg[1]) ? trim(strtolower($reg[1])) : 'utf-8';
            // Converting content:
            $fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
            $contentArr = $this->pObj->splitRegularContent($fileContent);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = PathUtility::basename($absFile);
            $this->setLocaleForServerFileSystem(true);
            break;
        case 'jpg':
        case 'jpeg':
        case 'tif':
            $this->setLocaleForServerFileSystem();
            // PHP EXIF
            if (function_exists('exif_read_data')) {
                $exif = @exif_read_data($absFile, 'IFD0');
            } else {
                $exif = false;
            }
            if ($exif) {
                $comment = trim(($exif['COMMENT'][0] ?? '') . ' ' . ($exif['ImageDescription'] ?? ''));
            } else {
                $comment = '';
            }
            $contentArr = $this->pObj->splitRegularContent($comment);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = PathUtility::basename($absFile);
            $this->setLocaleForServerFileSystem(true);
            break;
        default:
            return false;
    }
    // If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
    if (is_array($contentArr) && empty($contentArr['title'])) {
        // Substituting "_" for " " because many filenames may have this instead of a space char.
        $contentArr['title'] = str_replace('_', ' ', PathUtility::basename($absFile));
    }
    return $contentArr;
}
709 
/**
 * Switches LC_CTYPE to the configured system locale, or restores the locale
 * that was active before the switch.
 *
 * Does nothing unless $GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem'] is
 * set. Calls must come in pairs: first without argument, then with TRUE.
 *
 * @param bool $resetLocale TRUE restores the remembered locale, FALSE switches to the system locale
 * @throws \RuntimeException if a reset is requested without a remembered locale,
 *                           or a switch is requested while one is already in effect
 */
protected function setLocaleForServerFileSystem($resetLocale = false)
{
    if (!$GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem']) {
        return;
    }

    if (!$resetLocale) {
        if ($this->lastLocale !== null) {
            throw new \RuntimeException('Cannot set new locale as locale has already been changed before.', 1357064437);
        }
        // Passing '0' only queries the current locale; remember it for the later reset
        $currentLocale = setlocale(LC_CTYPE, '0');
        $this->lastLocale = $currentLocale ?: null;
        setlocale(LC_CTYPE, $GLOBALS['TYPO3_CONF_VARS']['SYS']['systemLocale']);
        return;
    }

    if ($this->lastLocale == null) {
        throw new \RuntimeException('Cannot reset locale to NULL.', 1357064326);
    }
    setlocale(LC_CTYPE, $this->lastLocale);
    $this->lastLocale = null;
}
740 
/**
 * Creates the list of section keys a file is indexed in.
 *
 * For PDF files the page count reported by pdfinfo is divided into page
 * ranges according to $this->pdf_mode; every other file type gets the single
 * key 0.
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename of the file
 * @return array Section keys: "<first page>-<last page>" strings for PDF files with pages, otherwise [0]
 */
public function fileContentParts($ext, $absFile)
{
    $cParts = [0];
    switch ($ext) {
        case 'pdf':
            // Without a registered pdfinfo binary there is nothing to ask for the page count
            if (empty($this->app['pdfinfo'])) {
                break;
            }
            // The locale must be switched before it can be restored below;
            // setLocaleForServerFileSystem(true) throws otherwise.
            $this->setLocaleForServerFileSystem();
            // Getting pdf-info:
            $cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
            CommandUtility::exec($cmd, $res);
            $pdfInfo = $this->splitPdfInfo($res);
            unset($res);
            $pageCount = (int)($pdfInfo['pages'] ?? 0);
            if ($pageCount) {
                $cParts = [];
                // Calculate mode: positive = pages per section, otherwise number of sections
                if ($this->pdf_mode > 0) {
                    $iter = ceil($pageCount / $this->pdf_mode);
                } else {
                    $iter = MathUtility::forceIntegerInRange(abs($this->pdf_mode), 1, $pageCount);
                }
                // Traverse and create intervals.
                for ($a = 0; $a < $iter; $a++) {
                    $low = (int)floor($a * ($pageCount / $iter)) + 1;
                    $high = (int)floor(($a + 1) * ($pageCount / $iter));
                    $cParts[] = $low . '-' . $high;
                }
            }
            $this->setLocaleForServerFileSystem(true);
            break;
        default:
    }
    return $cParts;
}
783 
/**
 * Converts "Key: value" output lines into an associative array.
 *
 * Each line is split at its first colon. Keys are trimmed and lowercased,
 * values are trimmed. Lines without a colon or with an empty key are skipped.
 *
 * @param array $pdfInfoArray Output lines, one "Key: value" pair per entry
 * @return array Values keyed by lowercased key name; empty if the input is not an array
 */
public function splitPdfInfo($pdfInfoArray)
{
    $info = [];
    if (!is_array($pdfInfoArray)) {
        return $info;
    }
    foreach ($pdfInfoArray as $infoLine) {
        $pair = explode(':', $infoLine, 2);
        if (count($pair) < 2) {
            continue;
        }
        $key = trim($pair[0]);
        if (!$key) {
            continue;
        }
        $info[strtolower($key)] = trim($pair[1]);
    }
    return $info;
}
805 
/**
 * Strips trailing line feeds and form feeds (character 12) from a string and
 * trims surrounding whitespace from the result.
 *
 * @param string $string String to clean up
 * @return string Cleaned string
 */
public function removeEndJunk($string)
{
    $trailingJunkPattern = '/[' . LF . chr(12) . ']*$/';
    $withoutJunk = preg_replace($trailingJunkPattern, '', $string);
    return trim((string)$withoutJunk);
}
816 
817  /************************
818  *
819  * Backend analyzer
820  *
821  ************************/
/**
 * Builds the "EXT:" path of the file type icon for an extension. "htm" shares
 * the icon of "html" and "jpeg" the icon of "jpg".
 *
 * @param string $extension File extension
 * @return string Icon path of the form EXT:indexed_search/Resources/Public/Icons/FileTypes/<extension>.gif
 */
public function getIcon($extension)
{
    $iconAliases = [
        'htm' => 'html',
        'jpeg' => 'jpg',
    ];
    if (is_string($extension) && isset($iconAliases[$extension])) {
        $extension = $iconAliases[$extension];
    }
    return 'EXT:indexed_search/Resources/Public/Icons/FileTypes/' . $extension . '.gif';
}
837 }
‪TYPO3\CMS\IndexedSearch\FileContentParser\softInit
‪bool softInit($extension)
Definition: FileContentParser.php:250
‪TYPO3\CMS\Core\Http\ApplicationType\fromRequest
‪static static fromRequest(ServerRequestInterface $request)
Definition: ApplicationType.php:62
‪TYPO3\CMS\IndexedSearch\FileContentParser\removeEndJunk
‪string removeEndJunk($string)
Definition: FileContentParser.php:805
‪TYPO3\CMS\Core\Utility\GeneralUtility\trimExplode
‪static list< string > trimExplode($delim, $string, $removeEmptyValues=false, $limit=0)
Definition: GeneralUtility.php:999
‪TYPO3\CMS\IndexedSearch\FileContentParser\$app
‪array $app
Definition: FileContentParser.php:44
‪TYPO3\CMS\IndexedSearch
‪TYPO3\CMS\Core\Utility\PathUtility
Definition: PathUtility.php:25
‪TYPO3\CMS\IndexedSearch\FileContentParser\splitPdfInfo
‪array splitPdfInfo($pdfInfoArray)
Definition: FileContentParser.php:785
‪TYPO3\CMS\Core\Configuration\ExtensionConfiguration
Definition: ExtensionConfiguration.php:45
‪TYPO3\CMS\IndexedSearch\FileContentParser\$lastLocale
‪string null $lastLocale
Definition: FileContentParser.php:64
‪TYPO3\CMS\IndexedSearch\FileContentParser\setLocaleForServerFileSystem
‪setLocaleForServerFileSystem($resetLocale=false)
Definition: FileContentParser.php:713
‪TYPO3\CMS\Core\Utility\MathUtility\forceIntegerInRange
‪static int forceIntegerInRange($theInt, $min, $max=2000000000, $defaultValue=0)
Definition: MathUtility.php:32
‪TYPO3\CMS\IndexedSearch\FileContentParser\$pdf_mode
‪int $pdf_mode
Definition: FileContentParser.php:40
‪TYPO3\CMS\Core\Core\Environment\isWindows
‪static bool isWindows()
Definition: Environment.php:318
‪TYPO3\CMS\IndexedSearch\FileContentParser\sL
‪string sL($reference)
Definition: FileContentParser.php:438
‪TYPO3\CMS\IndexedSearch\FileContentParser\$pObj
‪TYPO3 CMS IndexedSearch Indexer $pObj
Definition: FileContentParser.php:56
‪TYPO3\CMS\IndexedSearch\FileContentParser\$langObject
‪TYPO3 CMS Core Localization LanguageService TYPO3 CMS Frontend Controller TypoScriptFrontendController $langObject
Definition: FileContentParser.php:60
‪TYPO3\CMS\IndexedSearch\FileContentParser\initParser
‪bool initParser($extension)
Definition: FileContentParser.php:81
‪TYPO3\CMS\Core\Http\ApplicationType
Definition: ApplicationType.php:52
‪TYPO3\CMS\Core\Utility\PathUtility\basename
‪static string basename($path)
Definition: PathUtility.php:226
‪TYPO3\CMS\Core\Utility\GeneralUtility\getUrl
‪static string false getUrl($url)
Definition: GeneralUtility.php:1697
‪TYPO3\CMS\IndexedSearch\FileContentParser\readFileContent
‪array false null readFileContent($ext, $absFile, $cPKey)
Definition: FileContentParser.php:456
‪TYPO3\CMS\IndexedSearch\FileContentParser\fileContentParts
‪array fileContentParts($ext, $absFile)
Definition: FileContentParser.php:744
‪TYPO3\CMS\IndexedSearch\FileContentParser\$supportedExtensions
‪array $supportedExtensions
Definition: FileContentParser.php:52
‪TYPO3\CMS\Core\Utility\CommandUtility\exec
‪static string exec($command, &$output=null, &$returnValue=0)
Definition: CommandUtility.php:81
‪TYPO3\CMS\IndexedSearch\FileContentParser
Definition: FileContentParser.php:33
‪TYPO3\CMS\IndexedSearch\FileContentParser\searchTypeMediaTitle
‪string false searchTypeMediaTitle($extension)
Definition: FileContentParser.php:292
‪TYPO3\CMS\IndexedSearch\FileContentParser\__construct
‪__construct()
Definition: FileContentParser.php:69
‪TYPO3\CMS\IndexedSearch\FileContentParser\getIcon
‪string getIcon($extension)
Definition: FileContentParser.php:821
‪TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController
Definition: TypoScriptFrontendController.php:104
‪$GLOBALS
‪$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['adminpanel']['modules']
Definition: ext_localconf.php:25
‪TYPO3\CMS\Core\Core\Environment
Definition: Environment.php:43
‪TYPO3\CMS\IndexedSearch\FileContentParser\$ext2itemtype_map
‪array $ext2itemtype_map
Definition: FileContentParser.php:48
‪TYPO3\CMS\Core\Utility\MathUtility
Definition: MathUtility.php:22
‪TYPO3\CMS\Core\Utility\GeneralUtility
Definition: GeneralUtility.php:50
‪TYPO3\CMS\Core\Utility\CommandUtility
Definition: CommandUtility.php:49
‪TYPO3\CMS\IndexedSearch\FileContentParser\isMultiplePageExtension
‪bool isMultiplePageExtension($extension)
Definition: FileContentParser.php:422