‪TYPO3CMS  10.4
FileContentParser.php
Go to the documentation of this file.
1 <?php
2 
3 /*
4  * This file is part of the TYPO3 CMS project.
5  *
6  * It is free software; you can redistribute it and/or modify it under
7  * the terms of the GNU General Public License, either version 2
8  * of the License, or any later version.
9  *
10  * For the full copyright and license information, please read the
11  * LICENSE.txt file that was distributed with this source code.
12  *
13  * The TYPO3 project - inspiring people to share!
14  */
15 
17 
24 
30 {
/**
 * Page grouping mode for PDF files. initParser() overwrites it with the
 * "pdf_mode" extension setting, clamped to -100..100.
 *
 * As used by fileContentParts(): a positive value is the number of pages
 * per part; otherwise the absolute value is the number of parts (at least
 * one, at most the page count).
 *
 * @var int
 */
public $pdf_mode = -20;

/**
 * Paths of the external programs resolved by initParser(), keyed by
 * program name (e.g. "pdfinfo", "unzip").
 *
 * @var array
 */
public $app = [];

/**
 * Maps an initialised file extension to its item type
 * (e.g. "htm" => "html", "jpg" => "jpeg").
 *
 * @var array
 */
public $ext2itemtype_map = [];

/**
 * Extensions successfully set up by initParser(): extension => true.
 * readFileContent() refuses every extension that is not listed here.
 *
 * @var array
 */
public $supportedExtensions = [];

/**
 * Parent object; this class calls log_setTSlogMessage(),
 * splitRegularContent(), splitHTMLContent() and convertHTMLToUtf8() on it.
 * NOTE(review): presumably a \TYPO3\CMS\IndexedSearch\Indexer - confirm.
 *
 * @var object
 */
public $pObj;

/**
 * Object providing sL() for label resolution: $GLOBALS['TSFE'] in frontend
 * mode, $GLOBALS['LANG'] otherwise (see __construct()).
 *
 * @var object
 */
protected $langObject;

/**
 * LC_CTYPE locale that was active before setLocaleForServerFileSystem()
 * switched it; NULL while no switch is pending.
 *
 * @var string|null
 */
protected $lastLocale;
69 
/**
 * Selects the object that resolves "LLL:" label references for this request.
 */
public function __construct()
{
    // Frontend mode translates through TSFE, every other mode through LANG.
    if (TYPO3_MODE === 'FE') {
        $this->langObject = $GLOBALS['TSFE'];
    } else {
        $this->langObject = $GLOBALS['LANG'];
    }
}
78 
/**
 * Prepares the parser for one file extension.
 *
 * Reads the "indexed_search" extension configuration, rejects extensions
 * listed in "ignoreExtensions" and, for formats that are converted by an
 * external program, resolves that program into $this->app. On success the
 * extension is recorded in $this->supportedExtensions and
 * $this->ext2itemtype_map.
 *
 * @param string $extension File extension to initialise
 * @return bool TRUE if the extension can be handled, otherwise FALSE
 */
public function initParser($extension)
{
    $indexerConfig = (array)GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');

    // Ignore extensions (the setting is lower-cased before comparing)
    $ignoreExtensions = GeneralUtility::trimExplode(',', strtolower((string)($indexerConfig['ignoreExtensions'] ?? '')), true);
    if (is_string($extension) && in_array($extension, $ignoreExtensions, true)) {
        $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:ignoreExtensions'), $extension), 1);
        return false;
    }

    $extOK = false;
    $mainExtension = '';
    switch ($extension) {
        case 'pdf':
            // Both PDF tools live in the same directory and are needed together.
            if ($this->registerExternalTools($indexerConfig, 'pdftools', ['pdfinfo', 'pdftotext'], 'pdfTools')) {
                $this->pdf_mode = MathUtility::forceIntegerInRange($indexerConfig['pdf_mode'] ?? 0, -100, 100);
                $extOK = true;
            }
            break;
        case 'doc':
            $extOK = $this->registerExternalTools($indexerConfig, 'catdoc', ['catdoc'], 'catdoc');
            break;
        case 'pps':
        case 'ppt':
            // MS PowerPoint
            $extOK = $this->registerExternalTools($indexerConfig, 'ppthtml', ['ppthtml'], 'ppthtml');
            break;
        case 'xls':
            // MS Excel
            $extOK = $this->registerExternalTools($indexerConfig, 'xlhtml', ['xlhtml'], 'xlhtml');
            break;
        case 'docx': // Microsoft Word >= 2007
        case 'dotx':
        case 'pptx': // Microsoft PowerPoint >= 2007
        case 'ppsx':
        case 'potx':
        case 'xlsx': // Microsoft Excel >= 2007
        case 'xltx':
        case 'sxc': // OpenOffice / OpenDocument formats
        case 'sxi':
        case 'sxw':
        case 'ods':
        case 'odp':
        case 'odt':
            // All of these are read through "unzip".
            $extOK = $this->registerExternalTools($indexerConfig, 'unzip', ['unzip'], 'unzip');
            break;
        case 'rtf':
            $extOK = $this->registerExternalTools($indexerConfig, 'unrtf', ['unrtf'], 'unrtf');
            break;
        case 'txt':
        case 'csv':
        case 'xml':
        case 'tif':
            // No external program required
            $extOK = true;
            break;
        case 'html':
        case 'htm':
            $extOK = true;
            // making "html" the common "item_type"
            $mainExtension = 'html';
            break;
        case 'jpg':
        case 'jpeg':
            $extOK = true;
            // making "jpeg" the common item_type
            $mainExtension = 'jpeg';
            break;
    }

    if ($extOK) {
        $this->supportedExtensions[$extension] = true;
        $this->ext2itemtype_map[$extension] = $mainExtension ?: $extension;
        return true;
    }
    return false;
}

/**
 * Looks up the binaries of one external tool and registers them in $this->app.
 *
 * Logs the "<labelPrefix>Disabled" label when the setting is empty and the
 * "<labelPrefix>NotFound" label when a binary is missing; nothing is
 * registered unless every binary exists.
 *
 * @param array $indexerConfig Extension configuration of "indexed_search"
 * @param string $configKey Setting that holds the directory of the binaries
 * @param string[] $binaries Binary names without the ".exe" suffix
 * @param string $labelPrefix Prefix of the log labels in locallang_main.xlf
 * @return bool TRUE if all binaries were found and registered
 */
private function registerExternalTools(array $indexerConfig, string $configKey, array $binaries, string $labelPrefix): bool
{
    $labelBase = 'LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:' . $labelPrefix;
    if (empty($indexerConfig[$configKey])) {
        $this->pObj->log_setTSlogMessage($this->sL($labelBase . 'Disabled'), 1);
        return false;
    }

    $toolPath = rtrim((string)$indexerConfig[$configKey], '/') . '/';
    // If windows, apply extension to tool name:
    $exe = Environment::isWindows() ? '.exe' : '';

    $resolved = [];
    foreach ($binaries as $binary) {
        $binaryPath = $toolPath . $binary . $exe;
        // "@" is kept from the original code, which silenced is_file() in the same way.
        if (!@is_file($binaryPath)) {
            $this->pObj->log_setTSlogMessage(sprintf($this->sL($labelBase . 'NotFound'), $toolPath), 3);
            return false;
        }
        $resolved[$binary] = $binaryPath;
    }

    foreach ($resolved as $name => $binaryPath) {
        $this->app[$name] = $binaryPath;
    }
    return true;
}
246 
/**
 * Tells whether the extension belongs to the formats this class knows how
 * to parse. No configuration or external program is checked here.
 *
 * The comparison is strict: non-string input (which the former loose
 * switch could match, e.g. TRUE == 'pdf') is rejected.
 *
 * @param string $extension File extension
 * @return bool TRUE if the extension is one of the known formats
 */
public function softInit($extension)
{
    static $knownExtensions = [
        'pdf',
        'doc', 'docx', 'dotx',
        'pps', 'ppsx', 'ppt', 'pptx', 'potx',
        'xls', 'xlsx', 'xltx',
        'sxc', 'sxi', 'sxw',
        'ods', 'odp', 'odt',
        'rtf', 'txt', 'html', 'htm', 'csv', 'xml',
        'jpg', 'jpeg', 'tif',
    ];

    return is_string($extension) && in_array($extension, $knownExtensions, true);
}
289 
/**
 * Returns the translated media-type title for a file extension.
 *
 * Formats that need an external program only get a title when the matching
 * setting of the "indexed_search" configuration is non-empty.
 *
 * @param string $extension File extension
 * @return string|bool FALSE if the extension is listed in "ignoreExtensions",
 *                     an empty string if it is unknown or its tool is not
 *                     configured, otherwise the formatted label
 */
public function searchTypeMediaTitle($extension)
{
    // extension => [required configuration setting or NULL, label suffix]
    static $titleMap = [
        'pdf' => ['pdftools', 'PDF'],
        'doc' => ['catdoc', 'DOC'],
        'pps' => ['ppthtml', 'PP'],
        'ppt' => ['ppthtml', 'PP'],
        'xls' => ['xlhtml', 'XLS'],
        // Microsoft Word >= 2007
        'docx' => ['unzip', 'DOC'],
        'dotx' => ['unzip', 'DOC'],
        // Microsoft PowerPoint >= 2007
        'pptx' => ['unzip', 'PP'],
        'ppsx' => ['unzip', 'PP'],
        'potx' => ['unzip', 'PP'],
        // Microsoft Excel >= 2007
        'xlsx' => ['unzip', 'XLS'],
        'xltx' => ['unzip', 'XLS'],
        // Open Office Calc / Impress / Writer
        'sxc' => ['unzip', 'SXC'],
        'sxi' => ['unzip', 'SXI'],
        'sxw' => ['unzip', 'SXW'],
        // Oasis OpenDocument Spreadsheet / Presentation / Text
        'ods' => ['unzip', 'ODS'],
        'odp' => ['unzip', 'ODP'],
        'odt' => ['unzip', 'ODT'],
        'rtf' => ['unrtf', 'RTF'],
        // Formats without an external program
        'jpeg' => [null, 'images'],
        'jpg' => [null, 'images'],
        'tif' => [null, 'images'],
        'html' => [null, 'HTML'],
        'htm' => [null, 'HTML'],
        'txt' => [null, 'TXT'],
        'csv' => [null, 'CSV'],
        'xml' => [null, 'XML'],
    ];

    // Read indexer-config
    $indexerConfig = (array)GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');

    // Ignore extensions; this check comes first, so it also applies to unknown extensions.
    $ignoreExtensions = GeneralUtility::trimExplode(',', strtolower((string)($indexerConfig['ignoreExtensions'] ?? '')), true);
    if (is_string($extension) && in_array($extension, $ignoreExtensions, true)) {
        return false;
    }

    if (!is_string($extension) || !isset($titleMap[$extension])) {
        return '';
    }

    [$requiredSetting, $labelSuffix] = $titleMap[$extension];
    if ($requiredSetting !== null && empty($indexerConfig[$requiredSetting])) {
        return '';
    }

    return sprintf(
        $this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:extension.' . $labelSuffix),
        $extension
    );
}
419 
/**
 * Tells whether the extension denotes a multi-page format; only "pdf" does.
 *
 * @param string $extension File extension
 * @return bool TRUE for "pdf", otherwise FALSE
 */
public function isMultiplePageExtension($extension)
{
    return (string)$extension === 'pdf';
}
435 
/**
 * Resolves a label reference through the language object chosen in the constructor.
 *
 * @param string $reference Label reference, e.g. "LLL:EXT:.../locallang_main.xlf:key"
 * @return string Whatever the language object's sL() returns for the reference
 */
protected function sL($reference)
{
    $labelResolver = $this->langObject;

    return $labelResolver->sL($reference);
}
446 
447  /************************
448  *
449  * Reading documents (for parsing)
450  *
451  ************************/
/**
 * Reads the content of an external file and splits it into the content
 * array produced by the parent object's split*Content() methods.
 *
 * Branches that switch the locale first call
 * setLocaleForServerFileSystem() and reset it again at their end, because
 * the reset throws when no switch is pending.
 * NOTE(review): the switch presumably exists because escapeshellarg() is
 * locale-sensitive; that is why the file name is escaped only after it.
 *
 * @param string $ext File extension, e.g. "pdf", "doc" etc.
 * @param string $absFile Absolute path of the file
 * @param string $cPKey Part to read; for PDF a "<first>-<last>" page range as built by fileContentParts()
 * @return array|bool|null Content array; FALSE if the extension was not
 *                         initialised or is unknown; NULL if the required
 *                         program is not registered or a PDF yielded no
 *                         content and no title
 */
public function readFileContent($ext, $absFile, $cPKey)
{
    $contentArr = null;
    // Return immediately if initialization didn't set support up:
    if (empty($this->supportedExtensions[$ext])) {
        return false;
    }

    // First text value of a child tag inside a GeneralUtility::xml2tree() node list.
    $xmlValue = static function ($nodes, $tag) {
        return is_array($nodes) ? (string)($nodes[$tag][0]['values'][0] ?? '') : '';
    };

    // Switch by file extension
    switch ($ext) {
        case 'pdf':
            if (!empty($this->app['pdfinfo'])) {
                $this->setLocaleForServerFileSystem();
                // Getting pdf-info:
                $pdfInfo = $this->splitPdfInfo($this->runTool($this->app['pdfinfo'] . ' -enc UTF-8 ' . escapeshellarg($absFile)));
                if ((int)($pdfInfo['pages'] ?? 0)) {
                    // Page numbers are forced to integers before they reach the shell.
                    $pageRange = explode('-', (string)$cPKey, 2);
                    $low = (int)$pageRange[0];
                    $high = (int)($pageRange[1] ?? 0);
                    // Get pdf content:
                    $tempFileName = GeneralUtility::tempnam('Typo3_indexer');
                    // Remove the placeholder so the is_file() check below only
                    // succeeds when pdftotext actually wrote its output.
                    @unlink($tempFileName);
                    $cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
                    CommandUtility::exec($cmd);
                    if (@is_file($tempFileName)) {
                        $content = (string)file_get_contents($tempFileName);
                        unlink($tempFileName);
                    } else {
                        $content = '';
                        $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:pdfToolsFailed'), $absFile), 2);
                    }
                    if ($content !== '') {
                        $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                    }
                }
                if (!empty($pdfInfo['title'])) {
                    $contentArr['title'] = $pdfInfo['title'];
                }
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'doc':
            if (!empty($this->app['catdoc'])) {
                $this->setLocaleForServerFileSystem();
                $content = implode(LF, $this->runTool($this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile)));
                $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'pps':
        case 'ppt':
            if (!empty($this->app['ppthtml'])) {
                $this->setLocaleForServerFileSystem();
                $content = implode(LF, $this->runTool($this->app['ppthtml'] . ' ' . escapeshellarg($absFile)));
                $content = $this->pObj->convertHTMLToUtf8($content);
                $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                $contentArr['title'] = PathUtility::basename($absFile);
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'xls':
            if (!empty($this->app['xlhtml'])) {
                $this->setLocaleForServerFileSystem();
                $content = implode(LF, $this->runTool($this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile)));
                $content = $this->pObj->convertHTMLToUtf8($content);
                $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                $contentArr['title'] = PathUtility::basename($absFile);
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'docx':
        case 'dotx':
        case 'pptx':
        case 'ppsx':
        case 'potx':
        case 'xlsx':
        case 'xltx':
            if (!empty($this->app['unzip'])) {
                $this->setLocaleForServerFileSystem();
                // Archive member holding the main content of each format
                $mainParts = [
                    'docx' => 'word/document.xml',
                    'dotx' => 'word/document.xml',
                    'pptx' => 'ppt/slides/slide1.xml',
                    'ppsx' => 'ppt/slides/slide1.xml',
                    'potx' => 'ppt/slides/slide1.xml',
                    'xlsx' => 'xl/worksheets/sheet1.xml',
                    'xltx' => 'xl/worksheets/sheet1.xml',
                ];
                $unzipFile = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ';
                $content_xml = implode(LF, $this->runTool($unzipFile . $mainParts[(string)$ext]));
                // A space is inserted before each tag so words of adjacent elements stay separated.
                $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
                $contentArr = $this->pObj->splitRegularContent($utf8_content);
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = PathUtility::basename($absFile);
                // Meta information
                $meta_xml = implode(LF, $this->runTool($unzipFile . 'docProps/core.xml'));
                $metaContent = GeneralUtility::xml2tree($meta_xml);
                if (is_array($metaContent)) {
                    $coreProperties = $metaContent['cp:coreProperties'][0]['ch'] ?? [];
                    $contentArr['title'] .= ' ' . $xmlValue($coreProperties, 'dc:title');
                    $contentArr['description'] = $xmlValue($coreProperties, 'dc:subject') . ' ' . $xmlValue($coreProperties, 'dc:description');
                    $contentArr['keywords'] = $xmlValue($coreProperties, 'cp:keywords');
                }
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'sxi':
        case 'sxc':
        case 'sxw':
        case 'ods':
        case 'odp':
        case 'odt':
            if (!empty($this->app['unzip'])) {
                $this->setLocaleForServerFileSystem();
                $unzipFile = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ';
                // Read content.xml:
                $content_xml = implode(LF, $this->runTool($unzipFile . 'content.xml'));
                // Read meta.xml:
                $meta_xml = implode(LF, $this->runTool($unzipFile . 'meta.xml'));
                $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
                $contentArr = $this->pObj->splitRegularContent($utf8_content);
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = PathUtility::basename($absFile);
                // Meta information; a non-array result from xml2tree() is skipped.
                $metaTree = GeneralUtility::xml2tree($meta_xml);
                $metaContent = is_array($metaTree)
                    ? ($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'] ?? null)
                    : null;
                if (is_array($metaContent)) {
                    $contentArr['title'] = $xmlValue($metaContent, 'dc:title') ?: $contentArr['title'];
                    $contentArr['description'] = $xmlValue($metaContent, 'dc:subject') . ' ' . $xmlValue($metaContent, 'dc:description');
                    // Keywords collected:
                    $keywordNodes = $metaContent['meta:keywords'][0]['ch']['meta:keyword'] ?? null;
                    if (is_array($keywordNodes)) {
                        foreach ($keywordNodes as $kwDat) {
                            $contentArr['keywords'] = ($contentArr['keywords'] ?? '') . ($kwDat['values'][0] ?? '') . ' ';
                        }
                    }
                }
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'rtf':
            if (!empty($this->app['unrtf'])) {
                $this->setLocaleForServerFileSystem();
                $fileContent = implode(LF, $this->runTool($this->app['unrtf'] . ' ' . escapeshellarg($absFile)));
                $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
                $contentArr = $this->pObj->splitHTMLContent($fileContent);
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'txt':
        case 'csv':
            $this->setLocaleForServerFileSystem();
            // Raw text
            $content = GeneralUtility::getUrl($absFile);
            // @todo Implement auto detection of charset (currently assuming utf-8)
            $contentCharset = 'utf-8';
            $content = $this->pObj->convertHTMLToUtf8($content, $contentCharset);
            $contentArr = $this->pObj->splitRegularContent($content);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = PathUtility::basename($absFile);
            $this->setLocaleForServerFileSystem(true);
            break;
        case 'html':
        case 'htm':
            $fileContent = GeneralUtility::getUrl($absFile);
            $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
            $contentArr = $this->pObj->splitHTMLContent($fileContent);
            break;
        case 'xml':
            $this->setLocaleForServerFileSystem();
            $fileContent = (string)GeneralUtility::getUrl($absFile);
            // Finding charset in the XML declaration (first 200 bytes), falling back to utf-8:
            $reg = [];
            preg_match('/^[[:space:]]*<\\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
            $charset = !empty($reg[1]) ? trim(strtolower($reg[1])) : 'utf-8';
            // Converting content:
            $fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
            $contentArr = $this->pObj->splitRegularContent($fileContent);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = PathUtility::basename($absFile);
            $this->setLocaleForServerFileSystem(true);
            break;
        case 'jpg':
        case 'jpeg':
        case 'tif':
            $this->setLocaleForServerFileSystem();
            // PHP EXIF; "@" is kept from the original code.
            $exif = function_exists('exif_read_data') ? @exif_read_data($absFile, 'IFD0') : false;
            $comment = '';
            if ($exif) {
                $comment = trim(($exif['COMMENT'][0] ?? '') . ' ' . ($exif['ImageDescription'] ?? ''));
            }
            $contentArr = $this->pObj->splitRegularContent($comment);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = PathUtility::basename($absFile);
            $this->setLocaleForServerFileSystem(true);
            break;
        default:
            return false;
    }

    // If no title was found, the file name is used. Substituting "_" for " "
    // because many filenames may have this instead of a space char.
    if (is_array($contentArr) && empty($contentArr['title'])) {
        $contentArr['title'] = str_replace('_', ' ', PathUtility::basename($absFile));
    }
    return $contentArr;
}

/**
 * Runs a shell command and returns the output lines collected by CommandUtility::exec().
 *
 * @param string $cmd Complete command line, arguments already escaped
 * @return string[] Output lines (empty array if nothing was collected)
 */
private function runTool(string $cmd): array
{
    $outputLines = [];
    CommandUtility::exec($cmd, $outputLines);

    return (array)$outputLines;
}
707 
/**
 * Switches LC_CTYPE to the configured system locale, or restores the
 * locale that was active before the switch.
 *
 * Does nothing unless $GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem']
 * is set. Switch and reset have to alternate strictly.
 *
 * @param bool $resetLocale TRUE restores the remembered locale, FALSE switches to the system locale
 * @throws \RuntimeException if a reset is requested without a pending switch,
 *                           or a switch is requested while one is pending
 */
protected function setLocaleForServerFileSystem($resetLocale = false)
{
    if (!$GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem']) {
        return;
    }

    if (!$resetLocale) {
        if ($this->lastLocale !== null) {
            throw new \RuntimeException('Cannot set new locale as locale has already been changed before.', 1357064437);
        }
        // Passing '0' queries the current locale without changing it.
        $this->lastLocale = setlocale(LC_CTYPE, '0');
        setlocale(LC_CTYPE, $GLOBALS['TYPO3_CONF_VARS']['SYS']['systemLocale']);
        return;
    }

    if ($this->lastLocale == null) {
        throw new \RuntimeException('Cannot reset locale to NULL.', 1357064326);
    }
    setlocale(LC_CTYPE, $this->lastLocale);
    $this->lastLocale = null;
}
738 
/**
 * Returns the keys of the parts a file is indexed in.
 *
 * Every format except PDF is one part with key 0. A PDF is divided into
 * "<first>-<last>" page ranges according to $this->pdf_mode.
 *
 * @param string $ext File extension, e.g. "pdf", "doc" etc.
 * @param string $absFile Absolute path of the file
 * @return array Part keys: [0], or a list of "<first>-<last>" page ranges
 */
public function fileContentParts($ext, $absFile)
{
    $cParts = [0];
    // Without a registered pdfinfo binary there is no command to run.
    if ($ext !== 'pdf' || empty($this->app['pdfinfo'])) {
        return $cParts;
    }

    // The reset at the end throws unless the locale was switched first.
    $this->setLocaleForServerFileSystem();
    // Getting pdf-info:
    $pdfInfoLines = [];
    CommandUtility::exec($this->app['pdfinfo'] . ' ' . escapeshellarg($absFile), $pdfInfoLines);
    $pdfInfo = $this->splitPdfInfo($pdfInfoLines);
    $pageCount = (int)($pdfInfo['pages'] ?? 0);
    if ($pageCount) {
        $cParts = [];
        // Calculate mode: positive = pages per part, otherwise abs() = number of parts
        if ($this->pdf_mode > 0) {
            $iter = (int)ceil($pageCount / $this->pdf_mode);
        } else {
            $iter = MathUtility::forceIntegerInRange(abs($this->pdf_mode), 1, $pageCount);
        }
        // Traverse and create intervals.
        $pagesPerPart = $pageCount / $iter;
        for ($a = 0; $a < $iter; $a++) {
            $low = (int)floor($a * $pagesPerPart) + 1;
            $high = (int)floor(($a + 1) * $pagesPerPart);
            $cParts[] = $low . '-' . $high;
        }
    }
    $this->setLocaleForServerFileSystem(true);

    return $cParts;
}
781 
/**
 * Turns "Key: value" lines into an associative array.
 *
 * Each line is split at its first colon; keys are trimmed and lower-cased,
 * values are trimmed. Lines without a colon or with an empty key are
 * skipped; a later line overwrites an earlier one with the same key.
 *
 * @param array $pdfInfoArray Lines to split; any non-array input yields an empty array
 * @return array Map of lower-cased key to value
 */
public function splitPdfInfo($pdfInfoArray)
{
    $info = [];
    if (!is_array($pdfInfoArray)) {
        return $info;
    }

    foreach ($pdfInfoArray as $row) {
        $pair = explode(':', $row, 2);
        if (count($pair) <= 1) {
            continue;
        }
        $label = trim($pair[0]);
        if (!$label) {
            continue;
        }
        $info[strtolower($label)] = trim($pair[1]);
    }

    return $info;
}
803 
/**
 * Strips trailing line feeds and form feeds (chr(12)) from a string and
 * trims the result.
 *
 * @param string $string Text to clean
 * @return string Cleaned text
 */
public function removeEndJunk($string)
{
    $trailingJunk = '/[' . LF . chr(12) . ']*$/';
    $withoutJunk = preg_replace($trailingJunk, '', $string);

    return trim((string)$withoutJunk);
}
814 
815  /************************
816  *
817  * Backend analyzer
818  *
819  ************************/
/**
 * Builds the EXT: path of the file-type icon for an extension.
 *
 * "htm" shares the icon of "html" and "jpeg" the icon of "jpg".
 *
 * @param string $extension File extension
 * @return string Icon path below EXT:indexed_search/Resources/Public/Icons/FileTypes/
 */
public function getIcon($extension)
{
    static $sharedIcons = ['htm' => 'html', 'jpeg' => 'jpg'];

    $iconName = $extension;
    if (is_string($extension) && isset($sharedIcons[$extension])) {
        $iconName = $sharedIcons[$extension];
    }

    return 'EXT:indexed_search/Resources/Public/Icons/FileTypes/' . $iconName . '.gif';
}
835 }
‪TYPO3\CMS\IndexedSearch\FileContentParser\softInit
‪bool softInit($extension)
Definition: FileContentParser.php:247
‪TYPO3\CMS\IndexedSearch\FileContentParser\removeEndJunk
‪string removeEndJunk($string)
Definition: FileContentParser.php:803
‪TYPO3\CMS\IndexedSearch\FileContentParser\$app
‪array $app
Definition: FileContentParser.php:41
‪TYPO3\CMS\IndexedSearch
‪TYPO3\CMS\Core\Utility\PathUtility
Definition: PathUtility.php:24
‪TYPO3\CMS\IndexedSearch\FileContentParser\splitPdfInfo
‪array splitPdfInfo($pdfInfoArray)
Definition: FileContentParser.php:783
‪TYPO3\CMS\Core\Configuration\ExtensionConfiguration
Definition: ExtensionConfiguration.php:45
‪TYPO3\CMS\IndexedSearch\FileContentParser\setLocaleForServerFileSystem
‪setLocaleForServerFileSystem($resetLocale=false)
Definition: FileContentParser.php:711
‪TYPO3\CMS\Core\Utility\MathUtility\forceIntegerInRange
‪static int forceIntegerInRange($theInt, $min, $max=2000000000, $defaultValue=0)
Definition: MathUtility.php:32
‪TYPO3\CMS\Core\Utility\GeneralUtility\getUrl
‪static mixed getUrl($url, $includeHeader=0, $requestHeaders=null, &$report=null)
Definition: GeneralUtility.php:1748
‪TYPO3\CMS\IndexedSearch\FileContentParser\$pdf_mode
‪int $pdf_mode
Definition: FileContentParser.php:37
‪TYPO3\CMS\Core\Core\Environment\isWindows
‪static bool isWindows()
Definition: Environment.php:292
‪TYPO3\CMS\IndexedSearch\FileContentParser\sL
‪string sL($reference)
Definition: FileContentParser.php:435
‪TYPO3\CMS\IndexedSearch\FileContentParser\$pObj
TYPO3\CMS\IndexedSearch\Indexer $pObj
Definition: FileContentParser.php:53
‪TYPO3\CMS\IndexedSearch\FileContentParser\$langObject
TYPO3\CMS\Core\Localization\LanguageService|TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController $langObject
Definition: FileContentParser.php:57
‪TYPO3\CMS\IndexedSearch\FileContentParser\initParser
‪bool initParser($extension)
Definition: FileContentParser.php:78
‪TYPO3\CMS\Core\Utility\PathUtility\basename
‪static string basename($path)
Definition: PathUtility.php:165
‪TYPO3\CMS\IndexedSearch\FileContentParser\fileContentParts
‪array fileContentParts($ext, $absFile)
Definition: FileContentParser.php:742
‪TYPO3\CMS\IndexedSearch\FileContentParser\$supportedExtensions
‪array $supportedExtensions
Definition: FileContentParser.php:49
‪TYPO3\CMS\Core\Utility\CommandUtility\exec
‪static string exec($command, &$output=null, &$returnValue=0)
Definition: CommandUtility.php:81
‪TYPO3\CMS\IndexedSearch\FileContentParser
Definition: FileContentParser.php:30
‪TYPO3\CMS\IndexedSearch\FileContentParser\__construct
‪__construct()
Definition: FileContentParser.php:66
‪TYPO3\CMS\IndexedSearch\FileContentParser\getIcon
‪string getIcon($extension)
Definition: FileContentParser.php:819
‪TYPO3\CMS\Core\Utility\GeneralUtility\trimExplode
‪static string[] trimExplode($delim, $string, $removeEmptyValues=false, $limit=0)
Definition: GeneralUtility.php:1059
‪TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController
Definition: TypoScriptFrontendController.php:98
‪$GLOBALS
‪$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['adminpanel']['modules']
Definition: ext_localconf.php:5
‪TYPO3\CMS\IndexedSearch\FileContentParser\searchTypeMediaTitle
‪string searchTypeMediaTitle($extension)
Definition: FileContentParser.php:289
‪TYPO3\CMS\Core\Core\Environment
Definition: Environment.php:40
‪TYPO3\CMS\IndexedSearch\FileContentParser\$ext2itemtype_map
‪array $ext2itemtype_map
Definition: FileContentParser.php:45
‪TYPO3\CMS\Core\Utility\MathUtility
Definition: MathUtility.php:22
‪TYPO3\CMS\Core\Utility\GeneralUtility
Definition: GeneralUtility.php:46
‪TYPO3\CMS\IndexedSearch\FileContentParser\readFileContent
‪array readFileContent($ext, $absFile, $cPKey)
Definition: FileContentParser.php:453
‪TYPO3\CMS\Core\Utility\CommandUtility
Definition: CommandUtility.php:49
‪TYPO3\CMS\IndexedSearch\FileContentParser\$lastLocale
‪string $lastLocale
Definition: FileContentParser.php:61
‪TYPO3\CMS\IndexedSearch\FileContentParser\isMultiplePageExtension
‪bool isMultiplePageExtension($extension)
Definition: FileContentParser.php:419