‪TYPO3CMS  9.5
FileContentParser.php
Go to the documentation of this file.
1 <?php
3 
4 /*
5  * This file is part of the TYPO3 CMS project.
6  *
7  * It is free software; you can redistribute it and/or modify it under
8  * the terms of the GNU General Public License, either version 2
9  * of the License, or any later version.
10  *
11  * For the full copyright and license information, please read the
12  * LICENSE.txt file that was distributed with this source code.
13  *
14  * The TYPO3 project - inspiring people to share!
15  */
16 
23 
29 {
/**
 * Page-splitting mode for PDF files; initParser() clamps it to -100..100.
 *
 * fileContentParts() interprets a positive value as "pages per section" and a
 * zero or negative value as "number of sections" (absolute value, at least 1).
 *
 * @var int
 */
public $pdf_mode = -20;

/**
 * Paths of the external converter binaries registered by initParser(),
 * keyed by tool name (e.g. 'pdfinfo', 'pdftotext', 'catdoc', 'unzip').
 *
 * @var array
 */
public $app = [];

/**
 * Maps every initialized file extension to its item type
 * (e.g. 'htm' => 'html', 'jpg' => 'jpeg', otherwise the extension itself).
 *
 * @var array
 */
public $ext2itemtype_map = [];

/**
 * File extensions successfully set up by initParser(), stored as
 * extension => TRUE. readFileContent() refuses extensions missing here.
 *
 * Declared explicitly: the methods of this class read and write it, and
 * relying on an implicitly created dynamic property is deprecated in PHP 8.2.
 *
 * @var array
 */
public $supportedExtensions = [];

/**
 * Parent indexer object. This class calls log_setTSlogMessage(),
 * splitRegularContent(), splitHTMLContent() and convertHTMLToUtf8() on it.
 *
 * @var \TYPO3\CMS\IndexedSearch\Indexer
 */
public $pObj;

/**
 * Object used to resolve "LLL:" label references; chosen in the constructor.
 *
 * @var \TYPO3\CMS\Core\Localization\LanguageService|\TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController
 */
protected $langObject;

/**
 * LC_CTYPE locale that was active before setLocaleForServerFileSystem()
 * switched it; NULL while no switch is pending.
 *
 * @var string|null
 */
protected $lastLocale;
68 
/**
 * Picks the object that resolves language labels for this parser.
 *
 * In frontend context (TYPO3_MODE is 'FE') the global TSFE object is used,
 * in every other context the global LANG object.
 */
public function __construct()
{
    if (TYPO3_MODE === 'FE') {
        $this->langObject = $GLOBALS['TSFE'];
    } else {
        $this->langObject = $GLOBALS['LANG'];
    }
}
77 
/**
 * Initializes support for one file extension.
 *
 * Reads the "indexed_search" extension configuration, rejects extensions on
 * the configured ignore list and, for formats that need an external tool,
 * verifies that the tool is enabled and its binary exists. On success the
 * extension is recorded in $supportedExtensions and $ext2itemtype_map.
 *
 * @param string $extension File extension to initialize, e.g. "pdf"
 * @return bool TRUE if the extension is now supported, otherwise FALSE
 */
public function initParser($extension)
{
    $indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');

    // Extensions on the ignore list are never initialized. The comparison is
    // strict so that numeric-looking strings ("1e1" vs. "10") cannot collide.
    $ignoreList = isset($indexerConfig['ignoreExtensions']) ? (string)$indexerConfig['ignoreExtensions'] : '';
    $ignoreExtensions = GeneralUtility::trimExplode(',', strtolower($ignoreList), true);
    if (in_array($extension, $ignoreExtensions, true)) {
        $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:ignoreExtensions'), $extension), 1);
        return false;
    }

    $extOK = false;
    // Set when several extensions share one common item type.
    $mainExtension = '';
    switch ($extension) {
        case 'pdf':
            // PDF: needs both pdftotext and pdfinfo from the same directory.
            $extOK = $this->registerExternalTools($indexerConfig, 'pdftools', ['pdftotext', 'pdfinfo'], 'pdfTools');
            if ($extOK) {
                $pdfMode = isset($indexerConfig['pdf_mode']) ? $indexerConfig['pdf_mode'] : 0;
                $this->pdf_mode = MathUtility::forceIntegerInRange($pdfMode, -100, 100);
            }
            break;
        case 'doc':
            // MS Word (binary format) via catdoc
            $extOK = $this->registerExternalTools($indexerConfig, 'catdoc', ['catdoc'], 'catdoc');
            break;
        case 'pps':
        case 'ppt':
            // MS PowerPoint (binary format) via ppthtml
            $extOK = $this->registerExternalTools($indexerConfig, 'ppthtml', ['ppthtml'], 'ppthtml');
            break;
        case 'xls':
            // MS Excel (binary format) via xlhtml
            $extOK = $this->registerExternalTools($indexerConfig, 'xlhtml', ['xlhtml'], 'xlhtml');
            break;
        case 'docx': // Microsoft Word >= 2007
        case 'dotx':
        case 'pptx': // Microsoft PowerPoint >= 2007
        case 'ppsx':
        case 'potx':
        case 'xlsx': // Microsoft Excel >= 2007
        case 'xltx':
        case 'sxc': // OpenOffice.org 1.x formats
        case 'sxi':
        case 'sxw':
        case 'ods': // OASIS OpenDocument formats
        case 'odp':
        case 'odt':
            // All of these are zip containers and are read through unzip.
            $extOK = $this->registerExternalTools($indexerConfig, 'unzip', ['unzip'], 'unzip');
            break;
        case 'rtf':
            // Rich Text Format via unrtf
            $extOK = $this->registerExternalTools($indexerConfig, 'unrtf', ['unrtf'], 'unrtf');
            break;
        case 'txt':
        case 'csv':
        case 'xml':
        case 'tif':
            // Handled with PHP functions only, no external tool required.
            $extOK = true;
            break;
        case 'html':
        case 'htm':
            // Handled in PHP; "html" is the common item type for both.
            $extOK = true;
            $mainExtension = 'html';
            break;
        case 'jpg':
        case 'jpeg':
            // Handled with PHP EXIF; "jpeg" is the common item type for both.
            $extOK = true;
            $mainExtension = 'jpeg';
            break;
    }

    if ($extOK) {
        $this->supportedExtensions[$extension] = true;
        $this->ext2itemtype_map[$extension] = $mainExtension ?: $extension;
        return true;
    }
    return false;
}

/**
 * Checks that an external tool is enabled and that all of its binaries exist,
 * then registers the binaries in $this->app. Logs the reason on failure.
 *
 * @param array $indexerConfig "indexed_search" extension configuration
 * @param string $configKey Configuration key holding the tool directory, e.g. "pdftools"
 * @param array $binaries Binary base names expected in that directory, e.g. ["pdftotext", "pdfinfo"]
 * @param string $labelPrefix Prefix of the "<prefix>NotFound" / "<prefix>Disabled" log labels
 * @return bool TRUE if every binary was found and registered
 */
private function registerExternalTools($indexerConfig, $configKey, array $binaries, $labelPrefix)
{
    $labelFile = 'LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:';
    // An empty or missing path means the tool is switched off in the configuration.
    if (empty($indexerConfig[$configKey])) {
        $this->pObj->log_setTSlogMessage($this->sL($labelFile . $labelPrefix . 'Disabled'), 1);
        return false;
    }
    // On Windows the binaries carry an ".exe" suffix.
    $exe = Environment::isWindows() ? '.exe' : '';
    $toolPath = rtrim($indexerConfig[$configKey], '/') . '/';
    $located = [];
    foreach ($binaries as $binary) {
        $binaryFile = $toolPath . $binary . $exe;
        if (!@is_file($binaryFile)) {
            $this->pObj->log_setTSlogMessage(sprintf($this->sL($labelFile . $labelPrefix . 'NotFound'), $toolPath), 3);
            return false;
        }
        $located[$binary] = $binaryFile;
    }
    // Register only after every binary was found, so $this->app never holds a partial tool set.
    foreach ($located as $binary => $binaryFile) {
        $this->app[$binary] = $binaryFile;
    }
    return true;
}
245 
/**
 * Tells whether an extension belongs to the formats this parser knows about,
 * without checking configuration or the availability of external tools.
 *
 * @param string $extension File extension, e.g. "pdf"
 * @return bool TRUE if the extension is one of the known formats
 */
public function softInit($extension)
{
    $knownExtensions = [
        'pdf',
        'doc', 'docx', 'dotx',
        'pps', 'ppsx', 'ppt', 'pptx', 'potx',
        'xls', 'xlsx', 'xltx',
        'sxc', 'sxi', 'sxw',
        'ods', 'odp', 'odt',
        'rtf', 'txt',
        'html', 'htm',
        'csv', 'xml',
        'jpg', 'jpeg',
        // TIF images (EXIF comment)
        'tif',
    ];
    // Non-strict on purpose: this mirrors the loose comparison of a switch statement.
    return in_array($extension, $knownExtensions);
}
288 
/**
 * Returns the localized media-type title for a file extension.
 *
 * @param string $extension File extension, e.g. "pdf"
 * @return string|bool FALSE if the extension is on the configured ignore list,
 *                     an empty string if the extension is unknown or the tool
 *                     it requires is not configured, otherwise the title
 */
public function searchTypeMediaTitle($extension)
{
    $indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');

    // Ignored extensions get no title; strict comparison avoids numeric-string collisions.
    $ignoreList = isset($indexerConfig['ignoreExtensions']) ? (string)$indexerConfig['ignoreExtensions'] : '';
    $ignoreExtensions = GeneralUtility::trimExplode(',', strtolower($ignoreList), true);
    if (in_array($extension, $ignoreExtensions, true)) {
        return false;
    }

    // extension => [configuration key of the required tool (NULL = none), label suffix]
    $titleMap = [
        'pdf' => ['pdftools', 'PDF'],
        'doc' => ['catdoc', 'DOC'],
        // MS PowerPoint (binary format)
        'pps' => ['ppthtml', 'PP'],
        'ppt' => ['ppthtml', 'PP'],
        // MS Excel (binary format)
        'xls' => ['xlhtml', 'XLS'],
        // Microsoft Word >= 2007
        'docx' => ['unzip', 'DOC'],
        'dotx' => ['unzip', 'DOC'],
        // Microsoft PowerPoint >= 2007
        'pptx' => ['unzip', 'PP'],
        'ppsx' => ['unzip', 'PP'],
        'potx' => ['unzip', 'PP'],
        // Microsoft Excel >= 2007
        'xlsx' => ['unzip', 'XLS'],
        'xltx' => ['unzip', 'XLS'],
        // Open Office Calc / Impress / Writer
        'sxc' => ['unzip', 'SXC'],
        'sxi' => ['unzip', 'SXI'],
        'sxw' => ['unzip', 'SXW'],
        // Oasis OpenDocument Spreadsheet / Presentation / Text
        'ods' => ['unzip', 'ODS'],
        'odp' => ['unzip', 'ODP'],
        'odt' => ['unzip', 'ODT'],
        'rtf' => ['unrtf', 'RTF'],
        // Formats handled by PHP itself
        'jpeg' => [null, 'images'],
        'jpg' => [null, 'images'],
        'tif' => [null, 'images'],
        'html' => [null, 'HTML'],
        'htm' => [null, 'HTML'],
        'txt' => [null, 'TXT'],
        'csv' => [null, 'CSV'],
        'xml' => [null, 'XML'],
    ];

    if (!is_string($extension) || !isset($titleMap[$extension])) {
        return '';
    }
    list($requiredTool, $labelSuffix) = $titleMap[$extension];
    // A format that depends on an external tool only has a title while that tool is configured.
    if ($requiredTool !== null && empty($indexerConfig[$requiredTool])) {
        return '';
    }
    return sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:extension.' . $labelSuffix), $extension);
}
418 
/**
 * Tells whether files of the given extension are indexed in several
 * page sections. Only "pdf" qualifies.
 *
 * @param string $extension File extension
 * @return bool TRUE for "pdf", otherwise FALSE
 */
public function isMultiplePageExtension($extension)
{
    return (string)$extension === 'pdf';
}
434 
/**
 * Resolves a label reference by delegating to the language object
 * chosen in the constructor.
 *
 * @param string $reference Label reference, e.g. "LLL:EXT:.../locallang_main.xlf:key"
 * @return string Whatever the language object's sL() returns for the reference
 */
protected function sL($reference)
{
    $translator = $this->langObject;
    return $translator->sL($reference);
}
445 
446  /************************
447  *
448  * Reading documents (for parsing)
449  *
450  ************************/
/**
 * Reads the content of an external file and splits it into the indexer's
 * content array.
 *
 * Depending on the extension the file is converted by an external tool
 * registered in $this->app, or read with PHP functions. Around every access
 * that passes the file name to the file system or a shell, the LC_CTYPE locale
 * is switched with setLocaleForServerFileSystem() and reset afterwards; the
 * reset call throws if no switch preceded it, so both calls must stay paired.
 *
 * @param string $ext File extension, e.g. "pdf", "doc"
 * @param string $absFile Absolute filename of the file to read
 * @param string $cPKey Section pointer; for PDF a page range "low-high" as produced by fileContentParts()
 * @return array|bool|null Content array, FALSE if the extension is not initialized
 *                         or unknown, NULL if nothing could be extracted
 */
public function readFileContent($ext, $absFile, $cPKey)
{
    $contentArr = null;
    // Return immediately if initialization didn't set support up:
    if (empty($this->supportedExtensions[$ext])) {
        return false;
    }
    // Switch by file extension
    switch ($ext) {
        case 'pdf':
            if (!empty($this->app['pdfinfo'])) {
                $this->setLocaleForServerFileSystem();
                // Getting pdf-info:
                $cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $pdfInfo = $this->splitPdfInfo($res);
                unset($res);
                if (isset($pdfInfo['pages']) && (int)$pdfInfo['pages']) {
                    // The page range ends up in a shell command, so force both bounds to integers.
                    $pageRange = array_pad(explode('-', (string)$cPKey, 2), 2, 0);
                    $low = (int)$pageRange[0];
                    $high = (int)$pageRange[1];
                    // Create a temporary name and delete the file if it exists, just to be safe;
                    // pdftotext writes its output there.
                    $tempFileName = GeneralUtility::tempnam('Typo3_indexer');
                    @unlink($tempFileName);
                    $cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
                    // Run the conversion; without this call the temp file is never written.
                    CommandUtility::exec($cmd);
                    if (@is_file($tempFileName)) {
                        $content = file_get_contents($tempFileName);
                        unlink($tempFileName);
                    } else {
                        $content = '';
                        $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:pdfToolsFailed'), $absFile), 2);
                    }
                    if ((string)$content !== '') {
                        $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                    }
                }
                if (!empty($pdfInfo['title'])) {
                    $contentArr['title'] = $pdfInfo['title'];
                }
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'doc':
            if (!empty($this->app['catdoc'])) {
                $this->setLocaleForServerFileSystem();
                $cmd = $this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $content = implode(LF, $res);
                unset($res);
                $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'pps':
        case 'ppt':
            if (!empty($this->app['ppthtml'])) {
                $this->setLocaleForServerFileSystem();
                $cmd = $this->app['ppthtml'] . ' ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $content = implode(LF, $res);
                unset($res);
                $content = $this->pObj->convertHTMLToUtf8($content);
                $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = PathUtility::basename($absFile);
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'xls':
            if (!empty($this->app['xlhtml'])) {
                $this->setLocaleForServerFileSystem();
                $cmd = $this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $content = implode(LF, $res);
                unset($res);
                $content = $this->pObj->convertHTMLToUtf8($content);
                $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = PathUtility::basename($absFile);
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'docx':
        case 'dotx':
        case 'pptx':
        case 'ppsx':
        case 'potx':
        case 'xlsx':
        case 'xltx':
            if (!empty($this->app['unzip'])) {
                $this->setLocaleForServerFileSystem();
                switch ($ext) {
                    case 'docx':
                    case 'dotx':
                        // Read document.xml:
                        $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' word/document.xml';
                        break;
                    case 'ppsx':
                    case 'pptx':
                    case 'potx':
                        // Read slide1.xml (only the first slide is read):
                        $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ppt/slides/slide1.xml';
                        break;
                    case 'xlsx':
                    case 'xltx':
                        // Read sheet1.xml (only the first sheet is read):
                        $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' xl/worksheets/sheet1.xml';
                        break;
                }
                CommandUtility::exec($cmd, $res);
                $content_xml = implode(LF, $res);
                unset($res);
                // Put a space before every tag so words from adjacent elements don't merge when tags are stripped.
                $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
                $contentArr = $this->pObj->splitRegularContent($utf8_content);
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = PathUtility::basename($absFile);
                // Meta information
                $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' docProps/core.xml';
                CommandUtility::exec($cmd, $res);
                $meta_xml = implode(LF, $res);
                unset($res);
                $metaContent = GeneralUtility::xml2tree($meta_xml);
                if (is_array($metaContent)) {
                    $contentArr['title'] .= ' ' . $metaContent['cp:coreProperties'][0]['ch']['dc:title'][0]['values'][0];
                    $contentArr['description'] = $metaContent['cp:coreProperties'][0]['ch']['dc:subject'][0]['values'][0];
                    $contentArr['description'] .= ' ' . $metaContent['cp:coreProperties'][0]['ch']['dc:description'][0]['values'][0];
                    $contentArr['keywords'] = $metaContent['cp:coreProperties'][0]['ch']['cp:keywords'][0]['values'][0];
                }
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'sxi':
        case 'sxc':
        case 'sxw':
        case 'ods':
        case 'odp':
        case 'odt':
            if (!empty($this->app['unzip'])) {
                $this->setLocaleForServerFileSystem();
                // Read content.xml:
                $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' content.xml';
                CommandUtility::exec($cmd, $res);
                $content_xml = implode(LF, $res);
                unset($res);
                // Read meta.xml:
                $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' meta.xml';
                CommandUtility::exec($cmd, $res);
                $meta_xml = implode(LF, $res);
                unset($res);
                $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
                $contentArr = $this->pObj->splitRegularContent($utf8_content);
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = PathUtility::basename($absFile);
                // Meta information. Only descend into the tree when parsing produced an array
                // containing the expected branch.
                $metaTree = GeneralUtility::xml2tree($meta_xml);
                $metaContent = null;
                if (is_array($metaTree) && isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'])) {
                    $metaContent = $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
                }
                if (is_array($metaContent)) {
                    $contentArr['title'] = !empty($metaContent['dc:title'][0]['values'][0]) ? $metaContent['dc:title'][0]['values'][0] : $contentArr['title'];
                    $contentArr['description'] = $metaContent['dc:subject'][0]['values'][0] . ' ' . $metaContent['dc:description'][0]['values'][0];
                    // Keywords collected:
                    if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword']) && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
                        // Make sure the key exists before appending to it.
                        if (!isset($contentArr['keywords'])) {
                            $contentArr['keywords'] = '';
                        }
                        foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
                            $contentArr['keywords'] .= $kwDat['values'][0] . ' ';
                        }
                    }
                }
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'rtf':
            if (!empty($this->app['unrtf'])) {
                $this->setLocaleForServerFileSystem();
                $cmd = $this->app['unrtf'] . ' ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $fileContent = implode(LF, $res);
                unset($res);
                $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
                $contentArr = $this->pObj->splitHTMLContent($fileContent);
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'txt':
        case 'csv':
            $this->setLocaleForServerFileSystem();
            // Raw text
            $content = GeneralUtility::getUrl($absFile);
            // @todo Implement auto detection of charset (currently assuming utf-8)
            $contentCharset = 'utf-8';
            $content = $this->pObj->convertHTMLToUtf8($content, $contentCharset);
            $contentArr = $this->pObj->splitRegularContent($content);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = PathUtility::basename($absFile);
            $this->setLocaleForServerFileSystem(true);
            break;
        case 'html':
        case 'htm':
            $fileContent = GeneralUtility::getUrl($absFile);
            $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
            $contentArr = $this->pObj->splitHTMLContent($fileContent);
            break;
        case 'xml':
            $this->setLocaleForServerFileSystem();
            // PHP strip-tags()
            $fileContent = GeneralUtility::getUrl($absFile);
            // Finding charset in the XML declaration (looked for in the first 200 bytes only):
            preg_match('/^[[:space:]]*<\\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
            $charset = !empty($reg[1]) ? trim(strtolower($reg[1])) : 'utf-8';
            // Converting content:
            $fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
            $contentArr = $this->pObj->splitRegularContent($fileContent);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = PathUtility::basename($absFile);
            $this->setLocaleForServerFileSystem(true);
            break;
        case 'jpg':
        case 'jpeg':
        case 'tif':
            $this->setLocaleForServerFileSystem();
            // PHP EXIF
            if (function_exists('exif_read_data')) {
                $exif = @exif_read_data($absFile, 'IFD0');
            } else {
                $exif = false;
            }
            if ($exif) {
                $exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
                $exifDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
                $comment = trim($exifComment . ' ' . $exifDescription);
            } else {
                $comment = '';
            }
            $contentArr = $this->pObj->splitRegularContent($comment);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = PathUtility::basename($absFile);
            $this->setLocaleForServerFileSystem(true);
            break;
        default:
            return false;
    }
    // If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
    if (is_array($contentArr) && empty($contentArr['title'])) {
        // Substituting "_" for " " because many filenames may have this instead of a space char.
        $contentArr['title'] = str_replace('_', ' ', PathUtility::basename($absFile));
    }
    return $contentArr;
}
702 
/**
 * Switches the LC_CTYPE locale to the configured system locale, or restores
 * the locale that was active before the switch.
 *
 * Does nothing unless $GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem'] is
 * set. Calls must be paired: one switch, then one reset.
 *
 * @param bool $resetLocale FALSE switches to the system locale, TRUE restores the remembered locale
 * @throws \RuntimeException 1357064326 if a reset is requested while no locale is remembered,
 *                           1357064437 if a switch is requested while one is still pending
 */
protected function setLocaleForServerFileSystem($resetLocale = false)
{
    $utf8FileSystemEnabled = $GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem'];
    if (!$utf8FileSystemEnabled) {
        return;
    }

    if (!$resetLocale) {
        // Switching: refuse to overwrite a locale that is still waiting to be restored.
        if ($this->lastLocale !== null) {
            throw new \RuntimeException('Cannot set new locale as locale has already been changed before.', 1357064437);
        }
        // Passing 0 only queries the current locale without changing it.
        $this->lastLocale = setlocale(LC_CTYPE, 0);
        setlocale(LC_CTYPE, $GLOBALS['TYPO3_CONF_VARS']['SYS']['systemLocale']);
        return;
    }

    // Resetting: there must be a remembered locale to go back to.
    if ($this->lastLocale == null) {
        throw new \RuntimeException('Cannot reset locale to NULL.', 1357064326);
    }
    setlocale(LC_CTYPE, $this->lastLocale);
    $this->lastLocale = null;
}
733 
/**
 * Returns the section pointers into which a file is split for indexing.
 *
 * Only PDF files are split: the page count reported by pdfinfo is divided
 * into "low-high" page ranges according to $this->pdf_mode. Every other
 * extension, and a PDF whose page count cannot be determined, yields [0].
 *
 * @param string $ext File extension, e.g. "pdf"
 * @param string $absFile Absolute filename of the file
 * @return array List of section pointers: [0], or page ranges such as ["1-20", "21-40"]
 */
public function fileContentParts($ext, $absFile)
{
    $cParts = [0];
    switch ($ext) {
        case 'pdf':
            // Without a registered pdfinfo binary there is no command to run.
            if (empty($this->app['pdfinfo'])) {
                break;
            }
            // Switch the locale before the file name is handed to the shell; the
            // reset call below throws if no switch preceded it.
            $this->setLocaleForServerFileSystem();
            // Getting pdf-info:
            $cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
            CommandUtility::exec($cmd, $res);
            $pdfInfo = $this->splitPdfInfo($res);
            unset($res);
            $pageCount = isset($pdfInfo['pages']) ? (int)$pdfInfo['pages'] : 0;
            if ($pageCount) {
                $cParts = [];
                // Calculate mode: positive = pages per section, otherwise number of sections.
                if ($this->pdf_mode > 0) {
                    $iter = ceil($pageCount / $this->pdf_mode);
                } else {
                    $iter = MathUtility::forceIntegerInRange(abs($this->pdf_mode), 1, $pageCount);
                }
                // Traverse and create intervals.
                for ($a = 0; $a < $iter; $a++) {
                    $low = floor($a * ($pageCount / $iter)) + 1;
                    $high = floor(($a + 1) * ($pageCount / $iter));
                    $cParts[] = $low . '-' . $high;
                }
            }
            $this->setLocaleForServerFileSystem(true);
            break;
        default:
    }
    return $cParts;
}
776 
/**
 * Turns "Key: value" output lines into an associative array.
 *
 * Each line is split at its first colon; the key is trimmed and lowercased,
 * the value is trimmed. Lines without a colon or with an empty key are skipped.
 *
 * @param array $pdfInfoArray Output lines, one "Key: value" pair per entry
 * @return array Lowercased keys mapped to their trimmed values; empty if the input is not an array
 */
public function splitPdfInfo($pdfInfoArray)
{
    $info = [];
    if (!is_array($pdfInfoArray)) {
        return $info;
    }
    foreach ($pdfInfoArray as $outputLine) {
        $segments = explode(':', $outputLine, 2);
        if (count($segments) < 2) {
            continue;
        }
        $key = trim($segments[0]);
        if (!$key) {
            continue;
        }
        $info[strtolower($key)] = trim($segments[1]);
    }
    return $info;
}
798 
/**
 * Strips trailing line feeds and form feeds (character 12) from a string
 * and trims surrounding whitespace from the result.
 *
 * @param string $string Text to clean up
 * @return string The cleaned text
 */
public function removeEndJunk($string)
{
    $trailingJunkPattern = '/[' . LF . chr(12) . ']*$/';
    $withoutJunk = preg_replace($trailingJunkPattern, '', $string);
    return trim($withoutJunk);
}
809 
810  /************************
811  *
812  * Backend analyzer
813  *
814  ************************/
/**
 * Builds the icon path for a file extension.
 *
 * "htm" uses the "html" icon and "jpeg" the "jpg" icon; every other
 * extension is inserted into the path unchanged.
 *
 * @param string $extension File extension
 * @return string Path of the form "EXT:indexed_search/Resources/Public/Icons/FileTypes/<extension>.gif"
 */
public function getIcon($extension)
{
    $iconAliases = [
        'htm' => 'html',
        'jpeg' => 'jpg',
    ];
    $iconName = $extension;
    if (is_string($extension) && isset($iconAliases[$extension])) {
        $iconName = $iconAliases[$extension];
    }
    return 'EXT:indexed_search/Resources/Public/Icons/FileTypes/' . $iconName . '.gif';
}
830 }
‪TYPO3\CMS\IndexedSearch\FileContentParser\softInit
‪bool softInit($extension)
Definition: FileContentParser.php:246
‪TYPO3\CMS\IndexedSearch\FileContentParser\removeEndJunk
‪string removeEndJunk($string)
Definition: FileContentParser.php:798
‪TYPO3\CMS\IndexedSearch\FileContentParser\$app
‪array $app
Definition: FileContentParser.php:40
‪TYPO3\CMS\IndexedSearch
‪TYPO3\CMS\Core\Utility\PathUtility
Definition: PathUtility.php:23
‪TYPO3\CMS\IndexedSearch\FileContentParser\splitPdfInfo
‪array splitPdfInfo($pdfInfoArray)
Definition: FileContentParser.php:778
‪TYPO3\CMS\Core\Configuration\ExtensionConfiguration
Definition: ExtensionConfiguration.php:42
‪TYPO3\CMS\IndexedSearch\FileContentParser\setLocaleForServerFileSystem
‪setLocaleForServerFileSystem($resetLocale=false)
Definition: FileContentParser.php:706
‪TYPO3\CMS\Core\Utility\MathUtility\forceIntegerInRange
‪static int forceIntegerInRange($theInt, $min, $max=2000000000, $defaultValue=0)
Definition: MathUtility.php:31
‪TYPO3\CMS\IndexedSearch\FileContentParser\$pdf_mode
‪int $pdf_mode
Definition: FileContentParser.php:36
‪TYPO3\CMS\Core\Core\Environment\isWindows
‪static bool isWindows()
Definition: Environment.php:266
‪TYPO3\CMS\IndexedSearch\FileContentParser\sL
‪string sL($reference)
Definition: FileContentParser.php:434
‪TYPO3\CMS\IndexedSearch\FileContentParser\$pObj
TYPO3\CMS\IndexedSearch\Indexer $pObj
Definition: FileContentParser.php:52
‪TYPO3\CMS\IndexedSearch\FileContentParser\$langObject
TYPO3\CMS\Core\Localization\LanguageService|TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController $langObject
Definition: FileContentParser.php:56
‪TYPO3\CMS\IndexedSearch\FileContentParser\initParser
‪bool initParser($extension)
Definition: FileContentParser.php:77
‪TYPO3\CMS\Core\Utility\PathUtility\basename
‪static string basename($path)
Definition: PathUtility.php:164
‪TYPO3\CMS\IndexedSearch\FileContentParser\fileContentParts
‪array fileContentParts($ext, $absFile)
Definition: FileContentParser.php:737
‪TYPO3\CMS\IndexedSearch\FileContentParser\$supportedExtensions
‪array $supportedExtensions
Definition: FileContentParser.php:48
‪TYPO3\CMS\Core\Utility\CommandUtility\exec
‪static string exec($command, &$output=null, &$returnValue=0)
Definition: CommandUtility.php:80
‪TYPO3\CMS\IndexedSearch\FileContentParser
Definition: FileContentParser.php:29
‪TYPO3\CMS\IndexedSearch\FileContentParser\__construct
‪__construct()
Definition: FileContentParser.php:65
‪TYPO3\CMS\IndexedSearch\FileContentParser\getIcon
‪string getIcon($extension)
Definition: FileContentParser.php:814
‪TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController
Definition: TypoScriptFrontendController.php:97
‪$GLOBALS
‪$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['adminpanel']['modules']
Definition: ext_localconf.php:5
‪TYPO3\CMS\IndexedSearch\FileContentParser\searchTypeMediaTitle
‪string searchTypeMediaTitle($extension)
Definition: FileContentParser.php:288
‪TYPO3\CMS\Core\Core\Environment
Definition: Environment.php:39
‪TYPO3\CMS\IndexedSearch\FileContentParser\$ext2itemtype_map
‪array $ext2itemtype_map
Definition: FileContentParser.php:44
‪TYPO3\CMS\Core\Utility\MathUtility
Definition: MathUtility.php:21
‪TYPO3\CMS\Core\Utility\GeneralUtility
Definition: GeneralUtility.php:45
‪TYPO3\CMS\IndexedSearch\FileContentParser\readFileContent
‪array readFileContent($ext, $absFile, $cPKey)
Definition: FileContentParser.php:452
‪TYPO3\CMS\Core\Utility\CommandUtility
Definition: CommandUtility.php:48
‪TYPO3\CMS\IndexedSearch\FileContentParser\$lastLocale
‪string $lastLocale
Definition: FileContentParser.php:60
‪TYPO3\CMS\IndexedSearch\FileContentParser\isMultiplePageExtension
‪bool isMultiplePageExtension($extension)
Definition: FileContentParser.php:418