TYPO3 CMS  TYPO3_7-6
FileContentParser.php
Go to the documentation of this file.
1 <?php
3 
4 /*
5  * This file is part of the TYPO3 CMS project.
6  *
7  * It is free software; you can redistribute it and/or modify it under
8  * the terms of the GNU General Public License, either version 2
9  * of the License, or any later version.
10  *
11  * For the full copyright and license information, please read the
12  * LICENSE.txt file that was distributed with this source code.
13  *
14  * The TYPO3 project - inspiring people to share!
15  */
16 
20 
26 {
34  public $pdf_mode = -20; // PDF split mode read by fileContentParts(): a positive value is the number of pages per part, zero or a negative value is the (absolute) number of parts. initParser() replaces it with the configured 'pdf_mode' clamped to -100..100.
35 
39  public $app = []; // Full paths of the external tools located by initParser(), keyed by tool name ('pdfinfo', 'pdftotext', 'catdoc', 'ppthtml', 'xlhtml', 'unzip', 'unrtf').
40 
44  public $ext2itemtype_map = []; // Maps every successfully initialized extension to its item type, e.g. 'htm' => 'html' and 'jpg' => 'jpeg'; other extensions map to themselves.
45 
49  public $supportedExtensions = []; // Extensions (as keys, value TRUE) for which initParser() succeeded; readFileContent() refuses everything else.
50 
54  public $pObj; // Parent object; this class calls log_setTSlogMessage(), splitRegularContent(), splitHTMLContent(), convertHTMLToUtf8() and csObj->parse_charset() on it. NOTE(review): never assigned in this class - presumably set by the caller, confirm.
55 
59  protected $langObject; // Object whose sL() method resolves LLL: references; selected in the constructor from $GLOBALS['TSFE'] or $GLOBALS['LANG'].
60 
/**
 * Selects the language object that sL() will use for label lookups.
 */
public function __construct()
{
    // Frontend requests translate through TSFE, every other mode through LANG.
    if (TYPO3_MODE === 'FE') {
        $this->langObject = $GLOBALS['TSFE'];
    } else {
        $this->langObject = $GLOBALS['LANG'];
    }
}
69 
/**
 * Initializes the parser for one file extension.
 *
 * Reads the indexed_search extension configuration, rejects extensions on the
 * configured ignore list and, for formats that need an external tool, verifies
 * that the tool binaries exist. On success the extension is recorded in
 * $this->supportedExtensions and $this->ext2itemtype_map.
 *
 * @param string $extension File extension (lowercase, without the dot)
 * @return bool TRUE if the extension is supported and everything it needs is available
 */
public function initParser($extension)
{
    // unserialize() yields FALSE for a missing or corrupt configuration; treat that as "nothing configured".
    $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
    if (!is_array($indexerConfig)) {
        $indexerConfig = [];
    }
    // On Windows the tool binaries carry an ".exe" suffix.
    $exe = TYPO3_OS == 'WIN' ? '.exe' : '';
    $extOK = false;
    $mainExtension = '';

    // Ignored extensions. The comparison is strict so that numeric-looking
    // extensions are compared as strings ('0' must not match '00').
    $ignoreList = isset($indexerConfig['ignoreExtensions']) ? strtolower($indexerConfig['ignoreExtensions']) : '';
    $ignoreExtensions = GeneralUtility::trimExplode(',', $ignoreList, true);
    if (in_array((string)$extension, $ignoreExtensions, true)) {
        $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:ignoreExtensions'), $extension), 1);
        return false;
    }

    switch ($extension) {
        case 'pdf':
            // PDF: needs both pdfinfo and pdftotext from the same directory.
            $extOK = $this->registerExternalTools(
                $indexerConfig,
                'pdftools',
                ['pdfinfo', 'pdftotext'],
                $exe,
                'LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:pdfToolsNotFound',
                'LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:pdfToolsDisabled'
            );
            if ($extOK) {
                $pdfMode = isset($indexerConfig['pdf_mode']) ? $indexerConfig['pdf_mode'] : 0;
                $this->pdf_mode = MathUtility::forceIntegerInRange($pdfMode, -100, 100);
            }
            break;
        case 'doc':
            // MS Word (binary format): catdoc
            $extOK = $this->registerExternalTools(
                $indexerConfig,
                'catdoc',
                ['catdoc'],
                $exe,
                'LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:catdocNotFound',
                'LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:catdocDisabled'
            );
            break;
        case 'pps':
        case 'ppt':
            // MS PowerPoint (binary format): ppthtml
            $extOK = $this->registerExternalTools(
                $indexerConfig,
                'ppthtml',
                ['ppthtml'],
                $exe,
                'LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:ppthtmlNotFound',
                'LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:ppthtmlDisabled'
            );
            break;
        case 'xls':
            // MS Excel (binary format): xlhtml
            $extOK = $this->registerExternalTools(
                $indexerConfig,
                'xlhtml',
                ['xlhtml'],
                $exe,
                'LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:xlhtmlNotFound',
                'LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:xlhtmlDisabled'
            );
            break;
        case 'docx': // Microsoft Word >= 2007
        case 'dotx':
        case 'pptx': // Microsoft PowerPoint >= 2007
        case 'ppsx':
        case 'potx':
        case 'xlsx': // Microsoft Excel >= 2007
        case 'xltx':
        case 'sxc': // OpenOffice.org 1.x formats
        case 'sxi':
        case 'sxw':
        case 'ods': // OASIS OpenDocument formats
        case 'odp':
        case 'odt':
            // All of these are ZIP containers: unzip
            $extOK = $this->registerExternalTools(
                $indexerConfig,
                'unzip',
                ['unzip'],
                $exe,
                'LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:unzipNotFound',
                'LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:unzipDisabled'
            );
            break;
        case 'rtf':
            // Rich Text Format: unrtf
            $extOK = $this->registerExternalTools(
                $indexerConfig,
                'unrtf',
                ['unrtf'],
                $exe,
                'LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:unrtfNotFound',
                'LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:unrtfDisabled'
            );
            break;
        case 'txt':
        case 'csv':
        case 'xml':
        case 'tif':
            // Handled with PHP alone (raw text, strip_tags(), EXIF); no external tool needed.
            $extOK = true;
            break;
        case 'html':
        case 'htm':
            // Handled with PHP alone; "html" is the common item type for both.
            $extOK = true;
            $mainExtension = 'html';
            break;
        case 'jpg':
        case 'jpeg':
            // PHP EXIF; "jpeg" is the common item type for both.
            $extOK = true;
            $mainExtension = 'jpeg';
            break;
    }

    if ($extOK) {
        $this->supportedExtensions[$extension] = true;
        $this->ext2itemtype_map[$extension] = $mainExtension ?: $extension;
        return true;
    }
    return false;
}

/**
 * Checks that all binaries of one external tool exist and registers their paths in $this->app.
 *
 * Nothing is registered unless every requested binary is present. A missing
 * binary is logged with severity 3, an unconfigured tool with severity 1.
 *
 * @param array $indexerConfig Unserialized indexed_search extension configuration
 * @param string $configKey Configuration key holding the tool directory (e.g. 'catdoc')
 * @param string[] $binaries Base names of the binaries that must exist in that directory
 * @param string $exe Suffix appended to every binary name ('.exe' on Windows, otherwise empty)
 * @param string $notFoundLabel LLL reference of the "not found" message; the directory is inserted via sprintf()
 * @param string $disabledLabel LLL reference of the "disabled" message
 * @return bool TRUE if every binary was found and registered
 */
protected function registerExternalTools(array $indexerConfig, $configKey, array $binaries, $exe, $notFoundLabel, $disabledLabel)
{
    if (empty($indexerConfig[$configKey])) {
        $this->pObj->log_setTSlogMessage($this->sL($disabledLabel), 1);
        return false;
    }
    $toolPath = rtrim($indexerConfig[$configKey], '/') . '/';
    foreach ($binaries as $binary) {
        // "@" keeps is_file() quiet for paths it is not allowed to inspect.
        if (!@is_file($toolPath . $binary . $exe)) {
            $this->pObj->log_setTSlogMessage(sprintf($this->sL($notFoundLabel), $toolPath), 3);
            return false;
        }
    }
    foreach ($binaries as $binary) {
        $this->app[$binary] = $toolPath . $binary . $exe;
    }
    return true;
}
237 
/**
 * Tells whether an extension is one of the file types this class knows,
 * without checking configuration or the presence of external tools.
 *
 * @param string $extension File extension (lowercase, without the dot)
 * @return bool TRUE if the extension is a known file type
 */
public function softInit($extension)
{
    $knownExtensions = [
        'pdf',
        'doc', 'docx', 'dotx',
        'pps', 'ppsx', 'ppt', 'pptx', 'potx',
        'xls', 'xlsx', 'xltx',
        'sxc', 'sxi', 'sxw',
        'ods', 'odp', 'odt',
        'rtf', 'txt',
        'html', 'htm',
        'csv', 'xml',
        'jpg', 'jpeg', 'tif',
    ];
    // Loose (==) comparison is intentional here; do not add the strict flag.
    return in_array($extension, $knownExtensions);
}
281 
/**
 * Returns the label shown for a file extension in the media type selector.
 *
 * @param string $extension File extension (lowercase, without the dot)
 * @return string|bool FALSE if the extension is on the configured ignore list,
 *                     an empty string if the extension is unknown or the tool it
 *                     needs is not configured, otherwise the localized label
 */
public function searchTypeMediaTitle($extension)
{
    // unserialize() yields FALSE for a missing or corrupt configuration; treat that as "nothing configured".
    $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
    if (!is_array($indexerConfig)) {
        $indexerConfig = [];
    }

    // Ignored extensions. The comparison is strict so that numeric-looking
    // extensions are compared as strings ('0' must not match '00').
    $ignoreList = isset($indexerConfig['ignoreExtensions']) ? strtolower($indexerConfig['ignoreExtensions']) : '';
    $ignoreExtensions = GeneralUtility::trimExplode(',', $ignoreList, true);
    if (in_array((string)$extension, $ignoreExtensions, true)) {
        return false;
    }

    // Per extension: the configuration key of the required external tool
    // (empty if PHP alone handles the format) and the label key suffix.
    $requiredTool = '';
    switch ($extension) {
        case 'pdf':
            $requiredTool = 'pdftools';
            $labelKey = 'PDF';
            break;
        case 'doc':
            $requiredTool = 'catdoc';
            $labelKey = 'DOC';
            break;
        case 'pps':
        case 'ppt':
            // MS PowerPoint (binary format)
            $requiredTool = 'ppthtml';
            $labelKey = 'PP';
            break;
        case 'xls':
            // MS Excel (binary format)
            $requiredTool = 'xlhtml';
            $labelKey = 'XLS';
            break;
        case 'docx': // Microsoft Word >= 2007
        case 'dotx':
            $requiredTool = 'unzip';
            $labelKey = 'DOC';
            break;
        case 'pptx': // Microsoft PowerPoint >= 2007
        case 'ppsx':
        case 'potx':
            $requiredTool = 'unzip';
            $labelKey = 'PP';
            break;
        case 'xlsx': // Microsoft Excel >= 2007
        case 'xltx':
            $requiredTool = 'unzip';
            $labelKey = 'XLS';
            break;
        case 'sxc': // Open Office Calc
            $requiredTool = 'unzip';
            $labelKey = 'SXC';
            break;
        case 'sxi': // Open Office Impress
            $requiredTool = 'unzip';
            $labelKey = 'SXI';
            break;
        case 'sxw': // Open Office Writer
            $requiredTool = 'unzip';
            $labelKey = 'SXW';
            break;
        case 'ods': // Oasis OpenDocument Spreadsheet
            $requiredTool = 'unzip';
            $labelKey = 'ODS';
            break;
        case 'odp': // Oasis OpenDocument Presentation
            $requiredTool = 'unzip';
            $labelKey = 'ODP';
            break;
        case 'odt': // Oasis OpenDocument Text
            $requiredTool = 'unzip';
            $labelKey = 'ODT';
            break;
        case 'rtf':
            $requiredTool = 'unrtf';
            $labelKey = 'RTF';
            break;
        case 'jpeg':
        case 'jpg':
        case 'tif':
            // PHP EXIF
            $labelKey = 'Images';
            break;
        case 'html':
        case 'htm':
            $labelKey = 'HTML';
            break;
        case 'txt':
            $labelKey = 'TXT';
            break;
        case 'csv':
            $labelKey = 'CSV';
            break;
        case 'xml':
            $labelKey = 'XML';
            break;
        default:
            return '';
    }

    // A format that depends on an external tool is only offered when that tool is configured.
    if ($requiredTool !== '' && empty($indexerConfig[$requiredTool])) {
        return '';
    }
    $label = $this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:extension.' . $labelKey);
    return sprintf($label, $extension);
}
416 
/**
 * Tells whether files of the given extension are indexed in several parts.
 * Only PDF qualifies.
 *
 * @param string $extension File extension
 * @return bool TRUE for 'pdf', FALSE for everything else
 */
public function isMultiplePageExtension($extension)
{
    return (string)$extension === 'pdf';
}
433 
/**
 * Resolves a label reference through the language object chosen in the constructor.
 *
 * @param string $reference Label reference, e.g. an "LLL:EXT:..." string
 * @param bool $useHtmlSpecialChar Passed through unchanged to the language object's sL()
 * @return string Whatever the language object's sL() returns for the reference
 */
protected function sL($reference, $useHtmlSpecialChar = false)
{
    $translator = $this->langObject;
    return $translator->sL($reference, $useHtmlSpecialChar);
}
445 
446  /************************
447  *
448  * Reading documents (for parsing)
449  *
450  ************************/
/**
 * Reads the content of an external file and returns it as a content array.
 *
 * Every branch that touches the file switches the LC_CTYPE locale first
 * (setLocaleForServerFileSystem()) and restores it afterwards
 * (setLocaleForServerFileSystem(true)); the two calls must always come in
 * pairs, because the reset call throws when no locale was saved before.
 *
 * @param string $ext File extension, e.g. "pdf", "doc" etc.
 * @param string $absFile Absolute filename of the file (must exist and be validated by the caller)
 * @param string $cPKey Part pointer; for PDF files a page range "low-high" as produced by fileContentParts()
 * @return array|false|null Content array (with at least a 'title' entry when anything was read),
 *                          FALSE if the extension is not initialized or unknown,
 *                          NULL if nothing could be extracted
 */
public function readFileContent($ext, $absFile, $cPKey)
{
    $contentArr = null;
    // Return immediately if initialization didn't set support up:
    if (empty($this->supportedExtensions[$ext])) {
        return false;
    }
    // Switch by file extension
    switch ($ext) {
        case 'pdf':
            if (!empty($this->app['pdfinfo'])) {
                $this->setLocaleForServerFileSystem();
                // Getting pdf-info:
                $cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $pdfInfo = $this->splitPdfInfo($res);
                unset($res);
                if (isset($pdfInfo['pages']) && (int)$pdfInfo['pages']) {
                    // The page range ends up in a shell command: pad missing parts and force integers.
                    list($low, $high) = array_pad(explode('-', $cPKey), 2, 0);
                    // Create a temporary name, then delete the file so that its
                    // existence afterwards proves pdftotext has written it.
                    $tempFileName = GeneralUtility::tempnam('Typo3_indexer');
                    @unlink($tempFileName);
                    $cmd = $this->app['pdftotext'] . ' -f ' . (int)$low . ' -l ' . (int)$high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
                    CommandUtility::exec($cmd);
                    if (@is_file($tempFileName)) {
                        $content = GeneralUtility::getUrl($tempFileName);
                        unlink($tempFileName);
                    } else {
                        $content = '';
                        $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:pdfToolsFailed'), $absFile), 2);
                    }
                    if ((string)$content !== '') {
                        $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                    }
                }
                if (!empty($pdfInfo['title'])) {
                    $contentArr['title'] = $pdfInfo['title'];
                }
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'doc':
            if (!empty($this->app['catdoc'])) {
                $this->setLocaleForServerFileSystem();
                $cmd = $this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $content = implode(LF, $res);
                unset($res);
                $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'pps':
        case 'ppt':
            if (!empty($this->app['ppthtml'])) {
                $this->setLocaleForServerFileSystem();
                $cmd = $this->app['ppthtml'] . ' ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $content = implode(LF, $res);
                unset($res);
                $content = $this->pObj->convertHTMLToUtf8($content);
                $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = basename($absFile);
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'xls':
            if (!empty($this->app['xlhtml'])) {
                $this->setLocaleForServerFileSystem();
                $cmd = $this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $content = implode(LF, $res);
                unset($res);
                $content = $this->pObj->convertHTMLToUtf8($content);
                $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = basename($absFile);
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'docx':
        case 'dotx':
        case 'pptx':
        case 'ppsx':
        case 'potx':
        case 'xlsx':
        case 'xltx':
            if (!empty($this->app['unzip'])) {
                $this->setLocaleForServerFileSystem();
                switch ($ext) {
                    case 'docx':
                    case 'dotx':
                        // Read document.xml:
                        $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' word/document.xml';
                        break;
                    case 'ppsx':
                    case 'pptx':
                    case 'potx':
                        // Read slide1.xml:
                        $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ppt/slides/slide1.xml';
                        break;
                    case 'xlsx':
                    case 'xltx':
                        // Read sheet1.xml:
                        $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' xl/worksheets/sheet1.xml';
                        break;
                }
                CommandUtility::exec($cmd, $res);
                $content_xml = implode(LF, $res);
                unset($res);
                // Put a space in front of every tag so that words of adjacent elements don't run together.
                $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
                $contentArr = $this->pObj->splitRegularContent($utf8_content);
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = basename($absFile);
                // Meta information
                $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' docProps/core.xml';
                CommandUtility::exec($cmd, $res);
                $meta_xml = implode(LF, $res);
                unset($res);
                $metaContent = GeneralUtility::xml2tree($meta_xml);
                if (is_array($metaContent)) {
                    $coreProperties = [];
                    if (isset($metaContent['cp:coreProperties'][0]['ch']) && is_array($metaContent['cp:coreProperties'][0]['ch'])) {
                        $coreProperties = $metaContent['cp:coreProperties'][0]['ch'];
                    }
                    $contentArr['title'] .= ' ' . $this->getXmlTreeValue($coreProperties, 'dc:title');
                    $contentArr['description'] = $this->getXmlTreeValue($coreProperties, 'dc:subject');
                    $contentArr['description'] .= ' ' . $this->getXmlTreeValue($coreProperties, 'dc:description');
                    $contentArr['keywords'] = $this->getXmlTreeValue($coreProperties, 'cp:keywords');
                }
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'sxi':
        case 'sxc':
        case 'sxw':
        case 'ods':
        case 'odp':
        case 'odt':
            if (!empty($this->app['unzip'])) {
                $this->setLocaleForServerFileSystem();
                // Read content.xml:
                $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' content.xml';
                CommandUtility::exec($cmd, $res);
                $content_xml = implode(LF, $res);
                unset($res);
                // Read meta.xml:
                $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' meta.xml';
                CommandUtility::exec($cmd, $res);
                $meta_xml = implode(LF, $res);
                unset($res);
                $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
                $contentArr = $this->pObj->splitRegularContent($utf8_content);
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = basename($absFile);
                // Meta information. The tree is only descended into when it really is an
                // array; the Office Open XML branch checks is_array() on the same call,
                // so a non-array result has to be expected here too.
                $metaTree = GeneralUtility::xml2tree($meta_xml);
                $metaContent = null;
                if (is_array($metaTree) && isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'])) {
                    $metaContent = $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
                }
                if (is_array($metaContent)) {
                    $metaTitle = $this->getXmlTreeValue($metaContent, 'dc:title');
                    if ($metaTitle) {
                        $contentArr['title'] = $metaTitle;
                    }
                    $contentArr['description'] = $this->getXmlTreeValue($metaContent, 'dc:subject') . ' ' . $this->getXmlTreeValue($metaContent, 'dc:description');
                    // Keywords collected:
                    if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword']) && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
                        if (!isset($contentArr['keywords'])) {
                            $contentArr['keywords'] = '';
                        }
                        foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
                            $contentArr['keywords'] .= (isset($kwDat['values'][0]) ? $kwDat['values'][0] : '') . ' ';
                        }
                    }
                }
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'rtf':
            if (!empty($this->app['unrtf'])) {
                $this->setLocaleForServerFileSystem();
                $cmd = $this->app['unrtf'] . ' ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $fileContent = implode(LF, $res);
                unset($res);
                $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
                $contentArr = $this->pObj->splitHTMLContent($fileContent);
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'txt':
        case 'csv':
            $this->setLocaleForServerFileSystem();
            // Raw text
            $content = GeneralUtility::getUrl($absFile);
            // @todo Implement auto detection of charset (currently assuming utf-8)
            $contentCharset = 'utf-8';
            $content = $this->pObj->convertHTMLToUtf8($content, $contentCharset);
            $contentArr = $this->pObj->splitRegularContent($content);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = basename($absFile);
            $this->setLocaleForServerFileSystem(true);
            break;
        case 'html':
        case 'htm':
            $fileContent = GeneralUtility::getUrl($absFile);
            $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
            $contentArr = $this->pObj->splitHTMLContent($fileContent);
            break;
        case 'xml':
            $this->setLocaleForServerFileSystem();
            // PHP strip-tags()
            $fileContent = GeneralUtility::getUrl($absFile);
            // Finding charset in the XML declaration within the first 200 bytes:
            preg_match('/^[[:space:]]*<\\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
            $charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';
            // Converting content:
            $fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
            $contentArr = $this->pObj->splitRegularContent($fileContent);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = basename($absFile);
            $this->setLocaleForServerFileSystem(true);
            break;
        case 'jpg':
        case 'jpeg':
        case 'tif':
            $this->setLocaleForServerFileSystem();
            // PHP EXIF
            $exif = function_exists('exif_read_data') ? @exif_read_data($absFile, 'IFD0') : false;
            $comment = '';
            if ($exif) {
                $exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
                $exifDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
                $comment = trim($exifComment . ' ' . $exifDescription);
            }
            $contentArr = $this->pObj->splitRegularContent($comment);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = basename($absFile);
            $this->setLocaleForServerFileSystem(true);
            break;
        default:
            return false;
    }
    // If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
    if (is_array($contentArr) && empty($contentArr['title'])) {
        // Substituting "_" for " " because many filenames may have this instead of a space char.
        $contentArr['title'] = str_replace('_', ' ', basename($absFile));
    }
    return $contentArr;
}

/**
 * Returns the first value of the first child element with the given tag name
 * from one level of an xml2tree() result.
 *
 * @param array $children Child elements of a node (the node's 'ch' entry)
 * @param string $tagName Tag name to look up, e.g. 'dc:title'
 * @return mixed The stored value, or an empty string if the element or its value is missing
 */
protected function getXmlTreeValue(array $children, $tagName)
{
    return isset($children[$tagName][0]['values'][0]) ? $children[$tagName][0]['values'][0] : '';
}
702 
/**
 * Switches LC_CTYPE to the configured system locale, or restores the locale
 * that was active before the switch.
 *
 * Does nothing unless $GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem'] is
 * set. Calls must alternate: first without the flag (switch), then with
 * $resetLocale = TRUE (restore). The saved locale lives in a static variable.
 *
 * @param bool $resetLocale TRUE to restore the saved locale, FALSE to save the current one and switch
 * @return void
 * @throws \RuntimeException 1357064326 if asked to restore while no locale is saved,
 *                           1357064437 if asked to switch while a locale is already saved
 */
protected function setLocaleForServerFileSystem($resetLocale = false)
{
    static $savedLocale = null;

    $utf8FileSystem = $GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem'];
    if (!$utf8FileSystem) {
        return;
    }

    if (!$resetLocale) {
        // Switch: refuse to overwrite a locale that is still waiting to be restored.
        if ($savedLocale !== null) {
            throw new \RuntimeException('Cannot set new locale as locale has already been changed before.', 1357064437);
        }
        // Passing 0 only queries the current setting.
        $savedLocale = setlocale(LC_CTYPE, 0);
        setlocale(LC_CTYPE, $GLOBALS['TYPO3_CONF_VARS']['SYS']['systemLocale']);
        return;
    }

    // Restore: there has to be something to restore.
    if ($savedLocale == null) {
        throw new \RuntimeException('Cannot reset locale to NULL.', 1357064326);
    }
    setlocale(LC_CTYPE, $savedLocale);
    $savedLocale = null;
}
735 
/**
 * Splits a file into the parts that are indexed separately.
 *
 * Only PDF files are split: pdfinfo is asked for the page count and the pages
 * are divided into ranges according to $this->pdf_mode (positive: pages per
 * part; zero or negative: absolute value is the number of parts, at least 1
 * and at most one per page). Every other extension is one part.
 *
 * @param string $ext File extension, e.g. "pdf", "doc" etc.
 * @param string $absFile Absolute filename of the file (must exist and be validated by the caller)
 * @return array Page ranges "low-high" for PDF files with a known page count, otherwise [0]
 */
public function fileContentParts($ext, $absFile)
{
    $cParts = [0];
    switch ($ext) {
        case 'pdf':
            // Switch the locale before touching the file; the reset call below
            // throws if no locale was saved first.
            $this->setLocaleForServerFileSystem();
            // Getting pdf-info:
            $cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
            CommandUtility::exec($cmd, $res);
            $pdfInfo = $this->splitPdfInfo($res);
            unset($res);
            // pdfinfo may have produced no "Pages" line at all.
            $pageCount = isset($pdfInfo['pages']) ? (int)$pdfInfo['pages'] : 0;
            if ($pageCount) {
                $cParts = [];
                // Calculate the number of parts
                if ($this->pdf_mode > 0) {
                    $iter = ceil($pageCount / $this->pdf_mode);
                } else {
                    $iter = MathUtility::forceIntegerInRange(abs($this->pdf_mode), 1, $pageCount);
                }
                // Traverse and create intervals.
                $pagesPerPart = $pageCount / $iter;
                for ($a = 0; $a < $iter; $a++) {
                    $low = floor($a * $pagesPerPart) + 1;
                    $high = floor(($a + 1) * $pagesPerPart);
                    $cParts[] = $low . '-' . $high;
                }
            }
            $this->setLocaleForServerFileSystem(true);
            break;
        default:
    }
    return $cParts;
}
778 
/**
 * Turns "Key: value" lines into an associative array.
 *
 * Each line is split at its first colon. The trimmed, lowercased part before
 * the colon becomes the key and the trimmed remainder the value. Lines without
 * a colon, or with an empty (or "0") key, are skipped. A later line overwrites
 * an earlier one with the same key.
 *
 * @param array $pdfInfoArray Lines to parse; anything that is not an array yields an empty result
 * @return array Values keyed by lowercased line label
 */
public function splitPdfInfo($pdfInfoArray)
{
    $info = [];
    if (!is_array($pdfInfoArray)) {
        return $info;
    }
    foreach ($pdfInfoArray as $row) {
        $pair = explode(':', $row, 2);
        if (count($pair) < 2) {
            continue;
        }
        $label = trim($pair[0]);
        if (!$label) {
            continue;
        }
        $info[strtolower($label)] = trim($pair[1]);
    }
    return $info;
}
800 
/**
 * Strips the trailing run of line feeds and form feeds (chr(12)) from a
 * string and then trims whitespace from both ends.
 *
 * @param string $string String to clean up
 * @return string Cleaned string
 */
public function removeEndJunk($string)
{
    $trailingJunk = '/[' . LF . chr(12) . ']*$/';
    $withoutJunk = preg_replace($trailingJunk, '', $string);
    return trim($withoutJunk);
}
811 
812  /************************
813  *
814  * Backend analyzer
815  *
816  ************************/
/**
 * Builds the EXT: path of the file type icon for an extension.
 * "htm" shares the icon of "html" and "jpeg" that of "jpg".
 *
 * @param string $extension File extension
 * @return string Icon path of the form EXT:indexed_search/Resources/Public/Icons/FileTypes/<extension>.gif
 */
public function getIcon($extension)
{
    $iconAliases = [
        'htm' => 'html',
        'jpeg' => 'jpg',
    ];
    // Only real strings can be an alias; anything else is used as given.
    $iconName = is_string($extension) && isset($iconAliases[$extension])
        ? $iconAliases[$extension]
        : $extension;
    return 'EXT:indexed_search/Resources/Public/Icons/FileTypes/' . $iconName . '.gif';
}
832 }
static forceIntegerInRange($theInt, $min, $max=2000000000, $defaultValue=0)
Definition: MathUtility.php:31
sL($reference, $useHtmlSpecialChar=false)
static exec($command, &$output=null, &$returnValue=0)
static trimExplode($delim, $string, $removeEmptyValues=false, $limit=0)
static tempnam($filePrefix, $fileSuffix='')
static xml2tree($string, $depth=999, $parserOptions=[])
static getUrl($url, $includeHeader=0, $requestHeaders=false, &$report=null)
if(TYPO3_MODE==='BE') $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_tsfebeuserauth.php']['frontendEditingController']['default']