TYPO3CMS  8
 All Classes Namespaces Files Functions Variables Pages
FileContentParser.php
Go to the documentation of this file.
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3 
4 /*
5  * This file is part of the TYPO3 CMS project.
6  *
7  * It is free software; you can redistribute it and/or modify it under
8  * the terms of the GNU General Public License, either version 2
9  * of the License, or any later version.
10  *
11  * For the full copyright and license information, please read the
12  * LICENSE.txt file that was distributed with this source code.
13  *
14  * The TYPO3 project - inspiring people to share!
15  */
16 
20 
26 {
/**
 * Controls how a PDF file is divided into parts for indexing.
 * A positive value is used as "pages per part"; zero or a negative value is used,
 * as absolute value, as the number of parts (see fileContentParts()).
 * initParser() overwrites it with the "pdf_mode" indexer setting, limited to -100..100.
 */
34  public $pdf_mode = -20;
35 
/**
 * Absolute paths of the external tools found by initParser(), keyed by tool name
 * (e.g. "pdfinfo", "pdftotext", "catdoc", "unzip").
 */
39  public $app = [];
40 
/**
 * Maps a file extension to the item type recorded for it by initParser()
 * (e.g. "htm" => "html", "jpg" => "jpeg"; otherwise the extension itself).
 */
44  public $ext2itemtype_map = [];
45 
/**
 * Extensions that initParser() enabled successfully, stored as extension => true.
 * readFileContent() refuses extensions that are not registered here.
 */
49  public $supportedExtensions = [];
50 
/**
 * Parent object; this class calls its log_setTSlogMessage(), splitRegularContent(),
 * splitHTMLContent() and convertHTMLToUtf8() methods.
 * NOTE(review): presumably the indexer instance - its type is not visible here.
 */
54  public $pObj;
55 
/**
 * Object whose sL() method resolves "LLL:" label references; assigned in the constructor
 * from $GLOBALS['TSFE'] (frontend) or $GLOBALS['LANG'] (otherwise).
 */
59  protected $langObject;
60 
/**
 * Selects the object used for resolving "LLL:" label references.
 *
 * In frontend context (TYPO3_MODE is "FE") the object stored in $GLOBALS['TSFE']
 * is used, in every other context the one stored in $GLOBALS['LANG'].
 */
public function __construct()
{
    if (TYPO3_MODE === 'FE') {
        $this->langObject = $GLOBALS['TSFE'];
    } else {
        $this->langObject = $GLOBALS['LANG'];
    }
}
69 
/**
 * Enables parsing support for one file extension.
 *
 * Extensions that need an external tool (pdf, doc, ppt, xls, rtf and the zip based
 * office formats) are only enabled if the tool directory is configured in the
 * indexed_search extension configuration and the binaries exist there. Formats handled
 * with PHP alone (txt, csv, xml, tif, html, htm, jpg, jpeg) are always enabled.
 * On success the extension is registered in $this->supportedExtensions and
 * $this->ext2itemtype_map, and located binaries are stored in $this->app.
 *
 * @param string $extension File extension without dot; compared as given against the lowercased ignore list
 * @return bool TRUE if the extension is supported and enabled, FALSE otherwise
 */
public function initParser($extension)
{
    // Read the indexer configuration; fall back to an empty configuration if it is missing or broken.
    $indexerConfig = unserialize(
        (string)($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'] ?? ''),
        ['allowed_classes' => false]
    );
    if (!is_array($indexerConfig)) {
        $indexerConfig = [];
    }
    $extensionKey = is_scalar($extension) ? (string)$extension : '';

    // Extensions explicitly excluded by configuration:
    $ignoreExtensions = GeneralUtility::trimExplode(',', strtolower((string)($indexerConfig['ignoreExtensions'] ?? '')), true);
    if (in_array($extensionKey, $ignoreExtensions, true)) {
        $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:ignoreExtensions'), $extension), 1);
        return false;
    }

    // Extensions parsed with external tools: [configuration key, label prefix, required binaries]
    $pptTool = ['ppthtml', 'ppthtml', ['ppthtml']];
    $unzipTool = ['unzip', 'unzip', ['unzip']];
    $externalTools = [
        'pdf' => ['pdftools', 'pdfTools', ['pdfinfo', 'pdftotext']],
        'doc' => ['catdoc', 'catdoc', ['catdoc']],
        'pps' => $pptTool,
        'ppt' => $pptTool,
        'xls' => ['xlhtml', 'xlhtml', ['xlhtml']],
        'rtf' => ['unrtf', 'unrtf', ['unrtf']],
    ];
    // Microsoft Office >= 2007, OpenOffice 1.x and OpenDocument files are all zip archives:
    $zipBasedExtensions = ['docx', 'dotx', 'pptx', 'ppsx', 'potx', 'xlsx', 'xltx', 'sxc', 'sxi', 'sxw', 'ods', 'odp', 'odt'];
    $externalTools += array_fill_keys($zipBasedExtensions, $unzipTool);

    // Extensions parsed with PHP only, mapped to their common item type ('' = the extension itself):
    $builtInExtensions = [
        'txt' => '',
        'csv' => '',
        'xml' => '',
        'tif' => '',
        'html' => 'html',
        'htm' => 'html',
        'jpg' => 'jpeg',
        'jpeg' => 'jpeg',
    ];

    $mainExtension = '';
    if (isset($externalTools[$extensionKey])) {
        $tool = $externalTools[$extensionKey];
        if (!$this->registerExternalTool($indexerConfig, $tool[0], $tool[1], $tool[2])) {
            return false;
        }
        if ($extensionKey === 'pdf') {
            // PDF mode:
            $this->pdf_mode = MathUtility::forceIntegerInRange($indexerConfig['pdf_mode'] ?? 0, -100, 100);
        }
    } elseif (isset($builtInExtensions[$extensionKey])) {
        $mainExtension = $builtInExtensions[$extensionKey];
    } else {
        return false;
    }

    $this->supportedExtensions[$extension] = true;
    $this->ext2itemtype_map[$extension] = $mainExtension ?: $extension;
    return true;
}

/**
 * Locates the binaries of one external tool and stores their paths in $this->app.
 *
 * A log message is written through the parent object if the tool is not configured
 * ("<labelPrefix>Disabled") or if any binary is missing ("<labelPrefix>NotFound").
 * $this->app is only changed when all binaries were found.
 *
 * @param array $indexerConfig Unserialized indexed_search extension configuration
 * @param string $configKey Configuration key holding the tool directory
 * @param string $labelPrefix Prefix of the "...Disabled" / "...NotFound" language labels
 * @param string[] $binaries Binary names (without ".exe") that must exist in the directory
 * @return bool TRUE if all binaries were found
 */
private function registerExternalTool(array $indexerConfig, $configKey, $labelPrefix, array $binaries)
{
    $labelFile = 'LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:';
    if (empty($indexerConfig[$configKey])) {
        $this->pObj->log_setTSlogMessage($this->sL($labelFile . $labelPrefix . 'Disabled'), 1);
        return false;
    }
    // If windows, apply extension to tool name:
    $exe = TYPO3_OS === 'WIN' ? '.exe' : '';
    $toolPath = rtrim($indexerConfig[$configKey], '/') . '/';
    $located = [];
    foreach ($binaries as $binary) {
        $binaryFile = $toolPath . $binary . $exe;
        if (!@is_file($binaryFile)) {
            $this->pObj->log_setTSlogMessage(sprintf($this->sL($labelFile . $labelPrefix . 'NotFound'), $toolPath), 3);
            return false;
        }
        $located[$binary] = $binaryFile;
    }
    foreach ($located as $binary => $binaryFile) {
        $this->app[$binary] = $binaryFile;
    }
    return true;
}
237 
/**
 * Tells whether a file extension belongs to the formats this parser knows about,
 * without checking configuration or the presence of external tools.
 *
 * The extension is cast to string and matched strictly, so non-string input such as
 * 0 or TRUE no longer matches the first entry by loose comparison.
 *
 * @param string $extension File extension without dot
 * @return bool TRUE if the extension is one of the known formats
 */
public function softInit($extension)
{
    $knownExtensions = [
        'pdf',
        'doc', 'docx', 'dotx',
        'pps', 'ppsx', 'ppt', 'pptx', 'potx',
        'xls', 'xlsx', 'xltx',
        'sxc', 'sxi', 'sxw',
        'ods', 'odp', 'odt',
        'rtf', 'txt', 'html', 'htm', 'csv', 'xml',
        'jpg', 'jpeg', 'tif',
    ];
    return is_scalar($extension) && in_array((string)$extension, $knownExtensions, true);
}
281 
/**
 * Returns the localized title of the media type a file extension belongs to.
 *
 * Formats that depend on an external tool only get a title when that tool is
 * configured in the indexed_search extension configuration.
 *
 * @param string $extension File extension without dot
 * @return string|bool FALSE if the extension is on the configured ignore list, an empty string
 *                     if the extension is unknown or its tool is not configured, otherwise the title
 */
public function searchTypeMediaTitle($extension)
{
    // Read indexer-config; fall back to an empty configuration if it is missing or broken.
    $indexerConfig = unserialize(
        (string)($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'] ?? ''),
        ['allowed_classes' => false]
    );
    if (!is_array($indexerConfig)) {
        $indexerConfig = [];
    }
    $extensionKey = is_scalar($extension) ? (string)$extension : '';

    // Ignore extensions
    $ignoreExtensions = GeneralUtility::trimExplode(',', strtolower((string)($indexerConfig['ignoreExtensions'] ?? '')), true);
    if (in_array($extensionKey, $ignoreExtensions, true)) {
        return false;
    }

    // extension => [required configuration key (NULL = handled by PHP itself), label suffix]
    $mediaTypes = [
        'pdf' => ['pdftools', 'PDF'],
        'doc' => ['catdoc', 'DOC'],
        'pps' => ['ppthtml', 'PP'],
        'ppt' => ['ppthtml', 'PP'],
        'xls' => ['xlhtml', 'XLS'],
        // Microsoft Office >= 2007
        'docx' => ['unzip', 'DOC'],
        'dotx' => ['unzip', 'DOC'],
        'pptx' => ['unzip', 'PP'],
        'ppsx' => ['unzip', 'PP'],
        'potx' => ['unzip', 'PP'],
        'xlsx' => ['unzip', 'XLS'],
        'xltx' => ['unzip', 'XLS'],
        // OpenOffice 1.x Calc / Impress / Writer
        'sxc' => ['unzip', 'SXC'],
        'sxi' => ['unzip', 'SXI'],
        'sxw' => ['unzip', 'SXW'],
        // Oasis OpenDocument Spreadsheet / Presentation / Text
        'ods' => ['unzip', 'ODS'],
        'odp' => ['unzip', 'ODP'],
        'odt' => ['unzip', 'ODT'],
        'rtf' => ['unrtf', 'RTF'],
        // PHP EXIF
        'jpeg' => [null, 'Images'],
        'jpg' => [null, 'Images'],
        'tif' => [null, 'Images'],
        // PHP strip-tags() / raw text
        'html' => [null, 'HTML'],
        'htm' => [null, 'HTML'],
        'txt' => [null, 'TXT'],
        'csv' => [null, 'CSV'],
        'xml' => [null, 'XML'],
    ];
    if (!isset($mediaTypes[$extensionKey])) {
        return '';
    }
    $requiredConfigKey = $mediaTypes[$extensionKey][0];
    $labelSuffix = $mediaTypes[$extensionKey][1];
    if ($requiredConfigKey !== null && empty($indexerConfig[$requiredConfigKey])) {
        // The external tool for this format is not configured.
        return '';
    }
    return sprintf(
        $this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:extension.' . $labelSuffix),
        $extension
    );
}
416 
/**
 * Tells whether files with the given extension are indexed as several page ranges.
 * Only "pdf" qualifies.
 *
 * @param string $extension File extension without dot
 * @return bool TRUE for "pdf", FALSE for everything else
 */
public function isMultiplePageExtension($extension)
{
    $normalizedExtension = (string)$extension;
    return $normalizedExtension === 'pdf';
}
433 
/**
 * Resolves a label reference through the language object chosen in the constructor.
 *
 * @param string $reference Label reference, e.g. "LLL:EXT:indexed_search/...:key"
 * @return string Whatever the language object's sL() returns for the reference
 */
protected function sL($reference)
{
    $languageObject = $this->langObject;
    return $languageObject->sL($reference);
}
444 
445  /************************
446  *
447  * Reading documents (for parsing)
448  *
449  ************************/
/**
 * Reads the content of a file and returns it split into the parts used for indexing.
 *
 * The extension must have been enabled with initParser() before. For every format except
 * html/htm the locale is switched with setLocaleForServerFileSystem() while the file is
 * processed and is switched back afterwards, also when reading fails with an exception.
 *
 * @param string $ext File extension without dot, e.g. "pdf", "doc"
 * @param string $absFile Absolute filename of the file (must exist and be validated)
 * @param string $cPKey Content part key; only used for PDF files, where it is a page range "low-high"
 * @return array|bool|null Content array (with at least a "title" entry), FALSE if the extension is
 *                         not supported, NULL if the required tool is missing or nothing could be read
 */
public function readFileContent($ext, $absFile, $cPKey)
{
    // Return immediately if initialization didn't set support up:
    if (empty($this->supportedExtensions[$ext])) {
        return false;
    }
    $type = (string)$ext;

    // Formats that need an external tool, mapped to the entry in $this->app that must be set:
    $requiredTools = [
        'pdf' => 'pdfinfo',
        'doc' => 'catdoc',
        'pps' => 'ppthtml',
        'ppt' => 'ppthtml',
        'xls' => 'xlhtml',
        'rtf' => 'unrtf',
    ];
    $zipBasedExtensions = ['docx', 'dotx', 'pptx', 'ppsx', 'potx', 'xlsx', 'xltx', 'sxi', 'sxc', 'sxw', 'ods', 'odp', 'odt'];
    $requiredTools += array_fill_keys($zipBasedExtensions, 'unzip');
    $builtInExtensions = ['txt', 'csv', 'html', 'htm', 'xml', 'jpg', 'jpeg', 'tif'];

    if (isset($requiredTools[$type])) {
        if (empty($this->app[$requiredTools[$type]])) {
            // Tool not available: nothing can be read.
            return null;
        }
    } elseif (!in_array($type, $builtInExtensions, true)) {
        return false;
    }

    if ($type === 'html' || $type === 'htm') {
        // HTML files are read without switching the locale.
        $contentArr = $this->readContentByType($type, $absFile, $cPKey);
    } else {
        // The locale must be set before it can be reset; resetting without a preceding
        // set makes setLocaleForServerFileSystem() throw when UTF8filesystem is enabled.
        $this->setLocaleForServerFileSystem();
        try {
            $contentArr = $this->readContentByType($type, $absFile, $cPKey);
        } finally {
            $this->setLocaleForServerFileSystem(true);
        }
    }

    // If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
    if (is_array($contentArr) && empty($contentArr['title'])) {
        // Substituting "_" for " " because many filenames may have this instead of a space char.
        $contentArr['title'] = str_replace('_', ' ', basename($absFile));
    }
    return $contentArr;
}

/**
 * Dispatches to the reader for one file format. Expects the format to be known and its tool to be available.
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename
 * @param string $cPKey Content part key (page range for PDF files)
 * @return array|null Content array or NULL if nothing could be read
 */
private function readContentByType($ext, $absFile, $cPKey)
{
    switch ($ext) {
        case 'pdf':
            return $this->readPdfContent($absFile, $cPKey);
        case 'doc':
            $content = $this->runToolAndJoinOutput($this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile));
            return $this->pObj->splitRegularContent($this->removeEndJunk($content));
        case 'pps':
        case 'ppt':
        case 'xls':
            $toolCall = $ext === 'xls' ? $this->app['xlhtml'] . ' -nc -te ' : $this->app['ppthtml'] . ' ';
            $content = $this->runToolAndJoinOutput($toolCall . escapeshellarg($absFile));
            $content = $this->pObj->convertHTMLToUtf8($content);
            $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
            $contentArr['title'] = basename($absFile);
            return $contentArr;
        case 'docx':
        case 'dotx':
        case 'pptx':
        case 'ppsx':
        case 'potx':
        case 'xlsx':
        case 'xltx':
            return $this->readOfficeOpenXmlContent($ext, $absFile);
        case 'sxi':
        case 'sxc':
        case 'sxw':
        case 'ods':
        case 'odp':
        case 'odt':
            return $this->readOpenDocumentContent($absFile);
        case 'rtf':
            $fileContent = $this->runToolAndJoinOutput($this->app['unrtf'] . ' ' . escapeshellarg($absFile));
            $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
            return $this->pObj->splitHTMLContent($fileContent);
        case 'txt':
        case 'csv':
            // Raw text
            $content = GeneralUtility::getUrl($absFile);
            // @todo Implement auto detection of charset (currently assuming utf-8)
            $contentCharset = 'utf-8';
            $content = $this->pObj->convertHTMLToUtf8($content, $contentCharset);
            $contentArr = $this->pObj->splitRegularContent($content);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = basename($absFile);
            return $contentArr;
        case 'html':
        case 'htm':
            $fileContent = GeneralUtility::getUrl($absFile);
            $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
            return $this->pObj->splitHTMLContent($fileContent);
        case 'xml':
            // PHP strip-tags()
            $fileContent = GeneralUtility::getUrl($absFile);
            // Finding charset:
            $reg = [];
            preg_match('/^[[:space:]]*<\\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
            $charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';
            // Converting content:
            $fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
            $contentArr = $this->pObj->splitRegularContent($fileContent);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = basename($absFile);
            return $contentArr;
        default:
            // jpg, jpeg, tif: PHP EXIF
            $exif = function_exists('exif_read_data') ? @exif_read_data($absFile, 'IFD0') : false;
            $comment = '';
            if ($exif) {
                $comment = trim(($exif['COMMENT'][0] ?? '') . ' ' . ($exif['ImageDescription'] ?? ''));
            }
            $contentArr = $this->pObj->splitRegularContent($comment);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = basename($absFile);
            return $contentArr;
    }
}

/**
 * Extracts the text of one page range of a PDF file with pdfinfo and pdftotext.
 *
 * @param string $absFile Absolute filename
 * @param string $cPKey Page range "low-high"
 * @return array|null Content array or NULL if neither text nor a title was found
 */
private function readPdfContent($absFile, $cPKey)
{
    $contentArr = null;
    // Getting pdf-info:
    $pdfInfoLines = [];
    CommandUtility::exec($this->app['pdfinfo'] . ' ' . escapeshellarg($absFile), $pdfInfoLines);
    $pdfInfo = $this->splitPdfInfo($pdfInfoLines);
    if ((int)($pdfInfo['pages'] ?? 0)) {
        // The page numbers end up in a shell command, so only integers are accepted.
        // NOTE(review): a missing upper bound becomes 0, which pdftotext presumably treats as "last page" - confirm.
        $range = explode('-', (string)$cPKey, 2);
        $low = (int)$range[0];
        $high = (int)($range[1] ?? 0);
        // Create temporary name and delete the file if it exists, just to be safe.
        $tempFileName = GeneralUtility::tempnam('Typo3_indexer');
        @unlink($tempFileName);
        // Get pdf content:
        $cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
        CommandUtility::exec($cmd);
        if (@is_file($tempFileName)) {
            $content = file_get_contents($tempFileName);
            unlink($tempFileName);
        } else {
            $content = '';
            $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:pdfToolsFailed'), $absFile), 2);
        }
        if ((string)$content !== '') {
            $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
        }
    }
    if (!empty($pdfInfo['title'])) {
        $contentArr['title'] = $pdfInfo['title'];
    }
    return $contentArr;
}

/**
 * Reads Microsoft Office >= 2007 files (zip archives containing XML) with unzip.
 *
 * @param string $ext One of docx, dotx, pptx, ppsx, potx, xlsx, xltx
 * @param string $absFile Absolute filename
 * @return array Content array
 */
private function readOfficeOpenXmlContent($ext, $absFile)
{
    // Archive member holding the main content (first slide / first sheet only):
    $innerFiles = [
        'docx' => 'word/document.xml',
        'dotx' => 'word/document.xml',
        'ppsx' => 'ppt/slides/slide1.xml',
        'pptx' => 'ppt/slides/slide1.xml',
        'potx' => 'ppt/slides/slide1.xml',
        'xlsx' => 'xl/worksheets/sheet1.xml',
        'xltx' => 'xl/worksheets/sheet1.xml',
    ];
    $unzipCall = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ';
    $content_xml = $this->runToolAndJoinOutput($unzipCall . $innerFiles[$ext]);
    // A space is put in front of every tag so that words of adjacent elements do not get glued together.
    $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
    $contentArr = $this->pObj->splitRegularContent($utf8_content);
    // Make sure the title doesn't expose the absolute path!
    $contentArr['title'] = basename($absFile);
    // Meta information
    $metaContent = GeneralUtility::xml2tree($this->runToolAndJoinOutput($unzipCall . 'docProps/core.xml'));
    if (is_array($metaContent)) {
        $properties = $metaContent['cp:coreProperties'][0]['ch'] ?? [];
        $contentArr['title'] .= ' ' . ($properties['dc:title'][0]['values'][0] ?? '');
        $contentArr['description'] = ($properties['dc:subject'][0]['values'][0] ?? '')
            . ' ' . ($properties['dc:description'][0]['values'][0] ?? '');
        $contentArr['keywords'] = $properties['cp:keywords'][0]['values'][0] ?? null;
    }
    return $contentArr;
}

/**
 * Reads OpenOffice 1.x and OpenDocument files (zip archives with content.xml and meta.xml) with unzip.
 *
 * @param string $absFile Absolute filename
 * @return array Content array
 */
private function readOpenDocumentContent($absFile)
{
    $unzipCall = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ';
    // Read content.xml and meta.xml:
    $content_xml = $this->runToolAndJoinOutput($unzipCall . 'content.xml');
    $meta_xml = $this->runToolAndJoinOutput($unzipCall . 'meta.xml');
    $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
    $contentArr = $this->pObj->splitRegularContent($utf8_content);
    // Make sure the title doesn't expose the absolute path!
    $contentArr['title'] = basename($absFile);
    // Meta information (xml2tree() only yields an array when parsing succeeded)
    $metaTree = GeneralUtility::xml2tree($meta_xml);
    $metaContent = is_array($metaTree)
        ? ($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'] ?? null)
        : null;
    if (is_array($metaContent)) {
        if (!empty($metaContent['dc:title'][0]['values'][0])) {
            $contentArr['title'] = $metaContent['dc:title'][0]['values'][0];
        }
        $contentArr['description'] = ($metaContent['dc:subject'][0]['values'][0] ?? '')
            . ' ' . ($metaContent['dc:description'][0]['values'][0] ?? '');
        // Keywords collected:
        $keywordNodes = $metaContent['meta:keywords'][0]['ch']['meta:keyword'] ?? null;
        if (is_array($keywordNodes)) {
            foreach ($keywordNodes as $kwDat) {
                $contentArr['keywords'] = ($contentArr['keywords'] ?? '') . ($kwDat['values'][0] ?? '') . ' ';
            }
        }
    }
    return $contentArr;
}

/**
 * Executes a command and returns its output lines joined with line feeds.
 *
 * @param string $cmd Complete command line with all arguments already escaped
 * @return string Output of the command
 */
private function runToolAndJoinOutput($cmd)
{
    $outputLines = [];
    CommandUtility::exec($cmd, $outputLines);
    return implode(LF, (array)$outputLines);
}
701 
/**
 * Switches the LC_CTYPE locale to the configured system locale, or switches it back.
 *
 * Nothing happens unless $GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem'] is enabled.
 * The locale that was active before the switch is remembered in a static variable, so
 * every call that sets the locale must be followed by one call that resets it.
 *
 * @param bool $resetLocale TRUE to restore the remembered locale, FALSE to set the system locale
 * @throws \RuntimeException if the locale is reset without having been set, or set twice in a row
 */
protected function setLocaleForServerFileSystem($resetLocale = false)
{
    static $lastLocale = null;
    if (!$GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem']) {
        return;
    }

    if (!$resetLocale) {
        // Switch to the system locale and remember the current one.
        if ($lastLocale !== null) {
            throw new \RuntimeException('Cannot set new locale as locale has already been changed before.', 1357064437);
        }
        $lastLocale = setlocale(LC_CTYPE, 0);
        setlocale(LC_CTYPE, $GLOBALS['TYPO3_CONF_VARS']['SYS']['systemLocale']);
        return;
    }

    // Restore the remembered locale.
    if ($lastLocale == null) {
        throw new \RuntimeException('Cannot reset locale to NULL.', 1357064326);
    }
    setlocale(LC_CTYPE, $lastLocale);
    $lastLocale = null;
}
734 
/**
 * Creates the list of content part keys a file is indexed with.
 *
 * Every format except PDF is indexed as the single part [0]. A PDF file is divided into
 * page ranges "low-high" according to $this->pdf_mode: a positive value is the number of
 * pages per range, otherwise its absolute value is the number of ranges.
 *
 * @param string $ext File extension without dot
 * @param string $absFile Absolute filename
 * @return array List of content part keys; [0] if the file is not divided
 */
public function fileContentParts($ext, $absFile)
{
    $cParts = [0];
    // Without a located pdfinfo binary the command line would consist of the file name alone.
    if ((string)$ext !== 'pdf' || empty($this->app['pdfinfo'])) {
        return $cParts;
    }

    // The locale has to be set before it is reset; resetting alone makes
    // setLocaleForServerFileSystem() throw when UTF8filesystem is enabled.
    $this->setLocaleForServerFileSystem();
    try {
        // Getting pdf-info:
        $pdfInfoLines = [];
        CommandUtility::exec($this->app['pdfinfo'] . ' ' . escapeshellarg($absFile), $pdfInfoLines);
        $pdfInfo = $this->splitPdfInfo($pdfInfoLines);
    } finally {
        $this->setLocaleForServerFileSystem(true);
    }

    $pageCount = (int)($pdfInfo['pages'] ?? 0);
    if ($pageCount > 0) {
        // Calculate mode
        if ($this->pdf_mode > 0) {
            $iter = (int)ceil($pageCount / $this->pdf_mode);
        } else {
            $iter = MathUtility::forceIntegerInRange(abs($this->pdf_mode), 1, $pageCount);
        }
        // Traverse and create intervals. Integer division keeps the boundaries exact, so the
        // last interval always ends on the last page regardless of floating point rounding.
        $cParts = [];
        for ($a = 0; $a < $iter; $a++) {
            $low = intdiv($a * $pageCount, $iter) + 1;
            $high = intdiv(($a + 1) * $pageCount, $iter);
            $cParts[] = $low . '-' . $high;
        }
    }
    return $cParts;
}
777 
/**
 * Converts "Label: value" lines (the output of pdfinfo) into an associative array.
 *
 * Each line is split at its first colon. The trimmed, lowercased label becomes the key and
 * the trimmed remainder the value. Lines without a colon or with an empty label are skipped.
 *
 * @param array $pdfInfoArray Output lines; anything that is not an array yields an empty result
 * @return array Values keyed by lowercased label
 */
public function splitPdfInfo($pdfInfoArray)
{
    $info = [];
    if (!is_array($pdfInfoArray)) {
        return $info;
    }
    foreach ($pdfInfoArray as $infoLine) {
        $colonPosition = strpos($infoLine, ':');
        if ($colonPosition === false) {
            continue;
        }
        $label = trim(substr($infoLine, 0, $colonPosition));
        if (!$label) {
            continue;
        }
        $info[strtolower($label)] = trim(substr($infoLine, $colonPosition + 1));
    }
    return $info;
}
799 
/**
 * Strips trailing line feed and form feed (ASCII 12) characters and trims the result.
 *
 * @param string $string Text, e.g. the raw output of an external converter
 * @return string Text without the trailing junk and without surrounding whitespace
 */
public function removeEndJunk($string)
{
    $trailingJunkPattern = '/[' . LF . chr(12) . ']*$/';
    $withoutJunk = preg_replace($trailingJunkPattern, '', $string);
    return trim($withoutJunk);
}
810 
811  /************************
812  *
813  * Backend analyzer
814  *
815  ************************/
/**
 * Builds the path of the file type icon for an extension.
 *
 * "htm" shares the icon of "html" and "jpeg" the icon of "jpg"; every other
 * extension is inserted into the path unchanged.
 *
 * @param string $extension File extension without dot
 * @return string Icon reference in "EXT:" notation
 */
public function getIcon($extension)
{
    $sharedIcons = [
        'htm' => 'html',
        'jpeg' => 'jpg',
    ];
    $iconName = is_string($extension) && isset($sharedIcons[$extension])
        ? $sharedIcons[$extension]
        : $extension;
    return 'EXT:indexed_search/Resources/Public/Icons/FileTypes/' . $iconName . '.gif';
}
831 }
static trimExplode($delim, $string, $removeEmptyValues=false, $limit=0)
static tempnam($filePrefix, $fileSuffix= '')
static exec($command, &$output=null, &$returnValue=0)
static xml2tree($string, $depth=999, $parserOptions=[])
if(TYPO3_MODE=== 'BE') $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_tsfebeuserauth.php']['frontendEditingController']['default']
static forceIntegerInRange($theInt, $min, $max=2000000000, $defaultValue=0)
Definition: MathUtility.php:31