‪TYPO3CMS  ‪main
RteHtmlParser.php
Go to the documentation of this file.
1 <?php
2 
3 declare(strict_types=1);
4 
5 /*
6  * This file is part of the TYPO3 CMS project.
7  *
8  * It is free software; you can redistribute it and/or modify it under
9  * the terms of the GNU General Public License, either version 2
10  * of the License, or any later version.
11  *
12  * For the full copyright and license information, please read the
13  * LICENSE.txt file that was distributed with this source code.
14  *
15  * The TYPO3 project - inspiring people to share!
16  */
17 
18 namespace ‪TYPO3\CMS\Core\Html;
19 
20 use Psr\EventDispatcher\EventDispatcherInterface;
21 use Psr\Log\LoggerAwareInterface;
22 use Psr\Log\LoggerAwareTrait;
29 use TYPO3\HtmlSanitizer\Builder\BuilderInterface;
30 
39 class ‪RteHtmlParser extends ‪HtmlParser implements LoggerAwareInterface
40 {
41  use LoggerAwareTrait;
42 
46  protected string ‪$blockElementList = 'DIV,TABLE,BLOCKQUOTE,PRE,UL,OL,H1,H2,H3,H4,H5,H6,ADDRESS,DL,DD,HEADER,SECTION,FOOTER,NAV,ARTICLE,ASIDE,FIGURE,FIGCAPTION';
47 
51  protected string ‪$defaultAllowedTagsList = 'b,i,u,a,img,br,div,center,pre,figure,figcaption,font,hr,sub,sup,p,strong,em,li,ul,ol,blockquote,strike,span,abbr,acronym,dfn';
52 
56  protected array ‪$procOptions = [];
57 
61  protected int ‪$TS_transform_db_safecounter = 100;
62 
66  protected array ‪$getKeepTags_cache = [];
67 
71  protected array ‪$allowedClasses = [];
72 
78  'class',
79  'align',
80  'id',
81  'title',
82  'dir',
83  'lang',
84  'xml:lang',
85  'itemscope',
86  'itemtype',
87  'itemprop',
88  ];
89 
96  'address',
97  'article',
98  'aside',
99  'blockquote',
100  'div',
101  'footer',
102  'figure',
103  'figcaption',
104  'header',
105  'hr',
106  'nav',
107  'section',
108  ];
109 
110  public function ‪__construct(
111  protected readonly EventDispatcherInterface $eventDispatcher
112  ) {}
113 
118  protected function ‪setProcessingConfiguration(array $processingConfiguration): void
119  {
120  $this->procOptions = $processingConfiguration;
121  $this->getKeepTags_cache = [];
122 
123  if (isset($this->procOptions['allowedClasses.'])) {
124  $this->allowedClasses = (array)$this->procOptions['allowedClasses.'];
125  } else {
126  $this->allowedClasses = ‪GeneralUtility::trimExplode(',', $this->procOptions['allowedClasses'] ?? '', true);
127  }
128 
129  // Dynamic configuration of blockElementList
130  if (!empty($this->procOptions['blockElementList'])) {
131  $this->blockElementList = $this->procOptions['blockElementList'];
132  }
133 
134  // Define which attributes are allowed on <p> tags
135  if (isset($this->procOptions['allowAttributes.'])) {
136  $this->allowedAttributesForParagraphTags = $this->procOptions['allowAttributes.'];
137  }
138  // Override tags which are allowed outside of <p> tags
139  if (isset($this->procOptions['allowTagsOutside'])) {
140  if (!isset($this->procOptions['allowTagsOutside.'])) {
141  $this->allowedTagsOutsideOfParagraphs = ‪GeneralUtility::trimExplode(',', strtolower($this->procOptions['allowTagsOutside']), true);
142  } else {
143  $this->allowedTagsOutsideOfParagraphs = (array)$this->procOptions['allowTagsOutside.'];
144  }
145  }
146  }
147 
152  public function ‪transformTextForRichTextEditor(string $value, array $processingConfiguration): string
153  {
154  $this->‪setProcessingConfiguration($processingConfiguration);
155  $modes = $this->‪resolveAppliedTransformationModes('rte');
156  $value = $this->‪streamlineLineBreaksForProcessing($value);
157  // If an entry HTML cleaner was configured, pass the content through the HTMLcleaner
158  $value = $this->‪runHtmlParserIfConfigured($value, 'entryHTMLparser_rte');
159  // Traverse modes
160  foreach ($modes as $cmd) {
161  switch ($cmd) {
162  case 'detectbrokenlinks':
163  $value = $this->‪markBrokenLinks($value);
164  break;
165  case 'css_transform':
166  $value = $this->‪TS_transform_rte($value);
167  break;
168  default:
169  // Do nothing
170  }
171  }
172  // If an exit HTML cleaner was configured, pass the content through the HTMLcleaner
173  $value = $this->‪runHtmlParserIfConfigured($value, 'exitHTMLparser_rte');
174  // Final clean up of linebreaks
175  $value = $this->‪streamlineLineBreaksAfterProcessing($value);
176  return $value;
177  }
178 
182  public function ‪transformTextForPersistence(string $value, array $processingConfiguration): string
183  {
184  $this->‪setProcessingConfiguration($processingConfiguration);
185  $modes = $this->‪resolveAppliedTransformationModes('db');
186  $value = $this->‪streamlineLineBreaksForProcessing($value);
187  // If an entry HTML cleaner was configured, pass the content through the HTMLcleaner
188  $value = $this->‪runHtmlParserIfConfigured($value, 'entryHTMLparser_db');
189  // Traverse modes
190  foreach ($modes as $cmd) {
191  switch ($cmd) {
192  case 'detectbrokenlinks':
193  $value = $this->‪removeBrokenLinkMarkers($value);
194  break;
195  case 'ts_links':
196  $value = $this->‪TS_links_db($value);
197  break;
198  case 'css_transform':
199  // Transform empty paragraphs into spacing paragraphs
200  $value = str_replace('<p></p>', '<p>&nbsp;</p>', $value);
201  // Double any trailing spacing paragraph so that it does not get removed by divideIntoLines()
202  $value = preg_replace('/<p>&nbsp;<\/p>$/', '<p>&nbsp;</p><p>&nbsp;</p>', $value) ?? $value;
203  $value = $this->‪TS_transform_db($value);
204  break;
205  default:
206  // Do nothing
207  }
208  }
209  // process markup with HTML Sanitizer
210  $value = $this->‪htmlSanitize($value, $this->procOptions['HTMLparser_db.'] ?? []);
211  // If an exit HTML cleaner was configured, pass the content through the HTMLcleaner
212  $value = $this->‪runHtmlParserIfConfigured($value, 'exitHTMLparser_db');
213  // Final clean up of linebreaks
214  $value = $this->‪streamlineLineBreaksAfterProcessing($value);
215  return $value;
216  }
217 
223  protected function ‪resolveAppliedTransformationModes(string $direction): array
224  {
225  // Setting modes / transformations to be called
226  if ((string)($this->procOptions['overruleMode'] ?? '') !== '') {
227  $modes = ‪GeneralUtility::trimExplode(',', $this->procOptions['overruleMode']);
228  } else {
229  $modes = [$this->procOptions['mode']];
230  }
231 
232  $modeList = implode(',', $modes);
233 
234  // Replace the shortcut "default" with all custom modes
235  $modeList = str_replace('default', 'detectbrokenlinks,css_transform,ts_links', $modeList);
236 
237  // Make list unique
238  $modes = array_unique(‪GeneralUtility::trimExplode(',', $modeList, true));
239  // Reverse order if direction is "rte"
240  if ($direction === 'rte') {
241  $modes = array_reverse($modes);
242  }
243 
244  return $modes;
245  }
246 
257  protected function ‪runHtmlParserIfConfigured(string $content, string $configurationDirective): string
258  {
259  if (!empty($this->procOptions[$configurationDirective])) {
260  [$keepTags, $keepNonMatchedTags, $hscMode, $additionalConfiguration] = $this->‪HTMLparserConfig($this->procOptions[$configurationDirective . '.']);
261  $content = $this->‪HTMLcleaner($content, $keepTags, $keepNonMatchedTags, $hscMode, $additionalConfiguration);
262  }
263  return $content;
264  }
265 
266  /************************************
267  *
268  * Specific RTE TRANSFORMATION functions
269  *
270  *************************************/
271 
282  protected function ‪TS_links_db(string $value): string
283  {
284  $blockSplit = $this->‪splitIntoBlock('A', $value);
285  foreach ($blockSplit as $k => $v) {
286  if ($k % 2) {
287  [$tagAttributes] = $this->‪get_tag_attributes($this->‪getFirstTag($v), true);
288 
289  // Anchors would not have an href attribute
290  if (!isset($tagAttributes['href'])) {
291  continue;
292  }
293  $linkService = GeneralUtility::makeInstance(LinkService::class);
294  // Store the link as <a> tag as default by TYPO3, with the link service syntax
295  try {
296  $linkInformation = $linkService->resolve($tagAttributes['href'] ?? '');
297  $tagAttributes['href'] = $linkService->asString($linkInformation);
298  } catch (‪UnknownLinkHandlerException $e) {
299  $tagAttributes['href'] = $linkInformation['href'] ?? $tagAttributes['href'];
300  }
301 
302  $blockSplit[$k] = '<a ' . GeneralUtility::implodeAttributes($tagAttributes, true) . '>'
303  . $this->‪TS_links_db($this->‪removeFirstAndLastTag($blockSplit[$k])) . '</a>';
304  }
305  }
306  return implode('', $blockSplit);
307  }
308 
317  protected function ‪TS_transform_db(string $value): string
318  {
319  // Safety... so forever loops are avoided (they should not occur, but an error would potentially do this...)
320  $this->TS_transform_db_safecounter--;
321  if ($this->TS_transform_db_safecounter < 0) {
322  return $value;
323  }
324  // Split the content from RTE by the occurrence of these blocks:
325  $blockSplit = $this->‪splitIntoBlock($this->blockElementList, $value);
326 
327  // Avoid superfluous linebreaks by transform_db after ending headListTag
328  while (count($blockSplit) > 0 && trim(end($blockSplit)) === '') {
329  array_pop($blockSplit);
330  }
331 
332  // Traverse the blocks
333  foreach ($blockSplit as $k => $v) {
334  if ($k % 2) {
335  // Inside block:
336  // Init:
337  $tag = $this->‪getFirstTag($v);
338  $tagName = strtolower($this->‪getFirstTagName($v));
339  // Process based on the tag:
340  switch ($tagName) {
341  case 'blockquote':
342  case 'dd':
343  case 'div':
344  case 'header':
345  case 'section':
346  case 'footer':
347  case 'nav':
348  case 'article':
349  case 'aside':
350  $blockSplit[$k] = $tag . $this->‪TS_transform_db($this->‪removeFirstAndLastTag($blockSplit[$k])) . '</' . $tagName . '>';
351  break;
352  case 'pre':
353  break;
354  default:
355  // usually <hx> tags and <table> tags where no other block elements are within the tags
356  // Eliminate true linebreaks inside block element tags
357  $blockSplit[$k] = preg_replace('/[' . LF . ']+/', ' ', $blockSplit[$k]);
358  }
359  } else {
360  // NON-block:
361  if (trim($blockSplit[$k]) !== '') {
362  $string = $blockSplit[$k];
363  $string = preg_replace('#<([a-z]+)/>#', '<$1 />', $string);
364  // Remove linebreaks preceding hr tags
365  $string = preg_replace('/[' . LF . ']+<(hr)(\\s[^>\\/]*)?[[:space:]]*\\/?>/', '<$1$2/>', $string) ?? '';
366  // Remove linebreaks following hr tags
367  $string = preg_replace('/<(hr)(\\s[^>\\/]*)?[[:space:]]*\\/?>[' . LF . ']+/', '<$1$2/>', $string) ?? '';
368  // Replace other linebreaks with space
369  $string = preg_replace('/[' . LF . ']+/', ' ', $string);
370  // process allowed/removed tags
371  $string = $this->‪HTMLcleaner(
372  (string)$string,
373  $this->‪getKeepTags('db'),
374  $this->procOptions['HTMLparser_db.']['keepNonMatchedTags'] ?? '',
375  (int)($this->procOptions['HTMLparser_db.']['htmlSpecialChars'] ?? 0)
376  );
377  $blockSplit[$k] = (string)$this->‪divideIntoLines($string);
378  } else {
379  unset($blockSplit[$k]);
380  }
381  }
382  }
383  $this->TS_transform_db_safecounter++;
384  return implode(LF, $blockSplit);
385  }
386 
395  protected function ‪TS_transform_rte(string $value): string
396  {
397  // Split the content from database by the occurrence of the block elements
398  $blockSplit = $this->‪splitIntoBlock($this->blockElementList, $value);
399  // Traverse the blocks
400  foreach ($blockSplit as $k => $v) {
401  if ($k % 2) {
402  // Inside one of the blocks:
403  // Init:
404  $tag = $this->‪getFirstTag($v);
405  $tagName = strtolower($this->‪getFirstTagName($v));
406  // Based on tagname, we do transformations:
407  switch ($tagName) {
408  case 'blockquote':
409  case 'dd':
410  case 'div':
411  case 'header':
412  case 'section':
413  case 'footer':
414  case 'nav':
415  case 'article':
416  case 'aside':
417  $blockSplit[$k] = $tag . $this->‪TS_transform_rte($this->‪removeFirstAndLastTag($blockSplit[$k])) . '</' . $tagName . '>';
418  break;
419  }
420  if (!isset($blockSplit[$k + 1])) {
421  $blockSplit[$k + 1] = '';
422  }
423  $blockSplit[$k + 1] = preg_replace('/^[ ]*' . LF . '/', '', $blockSplit[$k + 1]);
424  } else {
425  // NON-block:
426  $nextFTN = $this->‪getFirstTagName($blockSplit[$k + 1] ?? '');
427  $onlyLineBreaks = (preg_match('/^[ ]*' . LF . '+[ ]*$/', $blockSplit[$k]) == 1);
428  // If the line is followed by a block or is the last line:
429  if (‪GeneralUtility::inList($this->blockElementList, $nextFTN) || !isset($blockSplit[$k + 1])) {
430  // If the line contains more than just linebreaks, reduce the number of trailing linebreaks by 1
431  if (!$onlyLineBreaks) {
432  $blockSplit[$k] = preg_replace('/(' . LF . '*)' . LF . '[ ]*$/', '$1', $blockSplit[$k]);
433  } else {
434  // If the line contains only linebreaks, remove the leading linebreak
435  $blockSplit[$k] = preg_replace('/^[ ]*' . LF . '/', '', $blockSplit[$k]);
436  }
437  }
438  // If $blockSplit[$k] is blank then unset the line, unless the line only contained linebreaks
439  if ((string)$blockSplit[$k] === '' && !$onlyLineBreaks) {
440  unset($blockSplit[$k]);
441  } else {
442  $blockSplit[$k] = $this->‪setDivTags($blockSplit[$k]);
443  }
444  }
445  }
446  return implode(LF, $blockSplit);
447  }
448 
449  /***************************************************************
450  *
451  * Generic RTE transformation, analysis and helper functions
452  *
453  **************************************************************/
454 
464  protected function ‪HTMLcleaner_db(string $content): string
465  {
466  $keepTags = $this->‪getKeepTags('db');
467  return $this->‪HTMLcleaner($content, $keepTags, false);
468  }
469 
478  protected function ‪getKeepTags(string $direction): array
479  {
480  if (!isset($this->getKeepTags_cache[$direction]) || !is_array($this->getKeepTags_cache[$direction])) {
481  // Setting up allowed tags:
482  // Default is to get allowed/denied tags from internal array of processing options:
483  // Construct default list of tags to keep:
484  if (isset($this->procOptions['allowTags.']) && is_array($this->procOptions['allowTags.'])) {
485  $keepTags = implode(',', $this->procOptions['allowTags.']);
486  } else {
487  $keepTags = $this->procOptions['allowTags'] ?? '';
488  }
489  $keepTags = array_flip(‪GeneralUtility::trimExplode(',', $this->defaultAllowedTagsList . ',' . strtolower($keepTags), true));
490  // For tags to deny, remove them from $keepTags array:
491  $denyTags = ‪GeneralUtility::trimExplode(',', $this->procOptions['denyTags'] ?? '', true);
492  foreach ($denyTags as $dKe) {
493  unset($keepTags[$dKe]);
494  }
495  // Based on the direction of content, set further options:
496  switch ($direction) {
497  case 'rte':
498  // Transforming keepTags array so it can be understood by the HTMLcleaner function.
499  // This basically converts the format of the array from TypoScript (having dots) to plain multi-dimensional array.
500  [$keepTags] = $this->‪HTMLparserConfig($this->procOptions['HTMLparser_rte.'] ?? [], $keepTags);
501  break;
502  case 'db':
503  // Setting up span tags if they are allowed:
504  if (isset($keepTags['span'])) {
505  $keepTags['span'] = [
506  'allowedAttribs' => 'id,class,style,title,lang,xml:lang,dir,itemscope,itemtype,itemprop',
507  'fixAttrib' => [
508  'class' => [
509  'removeIfFalse' => 1,
510  ],
511  ],
512  'rmTagIfNoAttrib' => 1,
513  ];
514  if (!empty($this->allowedClasses)) {
515  $keepTags['span']['fixAttrib']['class']['list'] = ‪$this->allowedClasses;
516  }
517  }
518  // Setting further options, getting them from the processing options
519  $TSc = $this->procOptions['HTMLparser_db.'] ?? [];
520  if (empty($TSc['globalNesting'])) {
521  $TSc['globalNesting'] = 'b,i,u,a,center,font,sub,sup,strong,em,strike,span';
522  }
523  if (empty($TSc['noAttrib'])) {
524  $TSc['noAttrib'] = 'b,i,u,br,center,hr,sub,sup,strong,em,li,ul,ol,blockquote,strike';
525  }
526  // Transforming the array from TypoScript to regular array:
527  [$keepTags] = $this->‪HTMLparserConfig($TSc, $keepTags);
528  break;
529  }
530  // Caching (internally, in object memory) the result
531  $this->getKeepTags_cache[$direction] = $keepTags;
532  }
533  // Return result:
534  return $this->getKeepTags_cache[$direction];
535  }
536 
549  protected function ‪divideIntoLines(string $value, int $count = 5, bool $returnArray = false)
550  {
551  // Setting the third param will eliminate false end-tags. Maybe this is a good thing to do...?
552  $paragraphBlocks = $this->‪splitIntoBlock('p', $value, true);
553  // Returns plainly the content if there was no p sections in it
554  if (count($paragraphBlocks) <= 1 || $count <= 0) {
555  return $this->‪sanitizeLineBreaksForContentOnly($value);
556  }
557 
558  // Traverse the splitted sections
559  foreach ($paragraphBlocks as $k => $v) {
560  if ($k % 2) {
561  // Inside a <p> section
562  $v = $this->‪removeFirstAndLastTag($v);
563  // Fetching 'sub-lines' - which will explode any further p nesting recursively
564  $subLines = $this->‪divideIntoLines($v, $count - 1, true);
565  // So, if there happened to be sub-nesting of p, this is written directly as the new content of THIS section. (This would be considered 'an error')
566  if (is_array($subLines)) {
567  $paragraphBlocks[$k] = implode(LF, $subLines);
568  } else {
569  //... but if NO subsection was found, we process it as a TRUE line without erroneous content:
570  $paragraphBlocks[$k] = $this->‪processContentWithinParagraph($subLines, $paragraphBlocks[$k]);
571  }
572  // If it turns out the line is just blank (containing a &nbsp; possibly) then just make it pure blank.
573  // But, prevent filtering of lines that are blank in sense above, but whose tags contain attributes.
574  // Those attributes should have been filtered before; if they are still there they must be considered as possible content.
575  if (trim(strip_tags($paragraphBlocks[$k])) === '&nbsp;' && !preg_match('/\\<(img)(\\s[^>]*)?\\/?>/si', $paragraphBlocks[$k]) && !preg_match('/\\<([^>]*)?( align| class| style| id| title| dir| lang| xml:lang)([^>]*)?>/si', trim($paragraphBlocks[$k]))) {
576  $paragraphBlocks[$k] = '';
577  }
578  } else {
579  // Outside a paragraph, if there is still something in there, just add a <p> tag
580  // Remove positions which are outside <p> tags and without content
581  $paragraphBlocks[$k] = trim(strip_tags($paragraphBlocks[$k], '<' . implode('><', $this->allowedTagsOutsideOfParagraphs) . '>'));
582  $paragraphBlocks[$k] = $this->‪sanitizeLineBreaksForContentOnly($paragraphBlocks[$k]);
583  if ((string)$paragraphBlocks[$k] === '') {
584  unset($paragraphBlocks[$k]);
585  } else {
586  // add <p> tags around the content
587  $paragraphBlocks[$k] = str_replace(strip_tags($paragraphBlocks[$k]), '<p>' . strip_tags($paragraphBlocks[$k]) . '</p>', $paragraphBlocks[$k]);
588  }
589  }
590  }
591  return $returnArray ? $paragraphBlocks : implode(LF, $paragraphBlocks);
592  }
593 
602  protected function ‪setDivTags(string $value): string
603  {
604  // First, setting configuration for the HTMLcleaner function. This will process each line between the <div>/<p> section on their way to the RTE
605  $keepTags = $this->‪getKeepTags('rte');
606  // Divide the content into lines
607  $parts = explode(LF, $value);
608  foreach ($parts as $k => $v) {
609  // Processing of line content:
610  // If the line is blank, set it to &nbsp;
611  if (trim($parts[$k]) === '') {
612  $parts[$k] = '&nbsp;';
613  } else {
614  // Clean the line content, keeping unknown tags (as they can be removed in the entryHTMLparser)
615  $parts[$k] = $this->‪HTMLcleaner($parts[$k], $keepTags, 'protect');
616  // convert double-encoded &nbsp; into regular &nbsp; however this could also be reversed via the exitHTMLparser
617  // This was previously an option to disable called "dontConvAmpInNBSP_rte"
618  $parts[$k] = str_replace('&amp;nbsp;', '&nbsp;', $parts[$k]);
619  }
620  $partFirstTagName = strtolower($this->‪getFirstTagName($parts[$k] ?? ''));
621  // Wrapping the line in <p> tags if not already wrapped and does not contain an hr tag and is not allowed outside of paragraphs.
622  if (!in_array($partFirstTagName, $this->allowedTagsOutsideOfParagraphs, true) && !preg_match('/<(hr)(\\s[^>\\/]*)?[[:space:]]*\\/?>/i', $partFirstTagName)) {
623  $testStr = strtolower(trim($parts[$k]));
624  if (!str_starts_with($testStr, '<div') || !str_ends_with($testStr, '</div>')) {
625  if (!str_starts_with($testStr, '<p') || !str_ends_with($testStr, '</p>')) {
626  // Only set p-tags if there is not already div or p tags:
627  $parts[$k] = '<p>' . $parts[$k] . '</p>';
628  }
629  }
630  }
631  }
632  // Implode result:
633  return implode(LF, $parts);
634  }
635 
648  protected function ‪processContentWithinParagraph(string $content, string $fullContentWithTag): string
649  {
650  // clean up the content
651  $content = $this->‪HTMLcleaner_db($content);
652  // Get the <p> tag, and validate the attributes
653  $fTag = $this->‪getFirstTag($fullContentWithTag);
654  // Check which attributes of the <p> tag to keep attributes
655  if (!empty($this->allowedAttributesForParagraphTags)) {
656  [$tagAttributes] = $this->‪get_tag_attributes($fTag);
657  // Make sure the tag attributes only contain the ones that are defined to be allowed
658  $tagAttributes = array_intersect_key($tagAttributes, array_flip($this->allowedAttributesForParagraphTags));
659 
660  // Only allow classes that are whitelisted in $this->allowedClasses
661  if (isset($tagAttributes['class']) && trim($tagAttributes['class']) !== '' && !empty($this->allowedClasses) && !in_array($tagAttributes['class'], $this->allowedClasses, true)) {
662  $classes = ‪GeneralUtility::trimExplode(' ', $tagAttributes['class'], true);
663  $classes = array_intersect($classes, $this->allowedClasses);
664  if (!empty($classes)) {
665  $tagAttributes['class'] = implode(' ', $classes);
666  } else {
667  unset($tagAttributes['class']);
668  }
669  }
670  } else {
671  $tagAttributes = [];
672  }
673  // Remove any line break
674  $content = str_replace(LF, '', $content);
675  // Compile the surrounding <p> tag
676  $content = '<' . rtrim('p ' . $this->‪compileTagAttribs($tagAttributes)) . '>' . $content . '</p>';
677  return $content;
678  }
679 
685  protected function ‪sanitizeLineBreaksForContentOnly(string $content): string
686  {
687  $content = preg_replace('/<(hr)(\\s[^>\\/]*)?[[:space:]]*\\/?>/i', LF . '<$1$2/>' . LF, $content) ?? $content;
688  $content = str_replace(LF . LF, LF, $content);
689  $content = preg_replace('/(^' . LF . ')|(' . LF . '$)/i', '', $content) ?? $content;
690  return $content;
691  }
692 
703  protected function ‪streamlineLineBreaksForProcessing(string $content): string
704  {
705  return str_replace(CR, '', $content);
706  }
707 
718  protected function ‪streamlineLineBreaksAfterProcessing(string $content): string
719  {
720  // Make sure no \r\n sequences has entered in the meantime
721  $content = $this->‪streamlineLineBreaksForProcessing($content);
722  // ... and then change all \n into \r\n
723  return str_replace(LF, CRLF, $content);
724  }
725 
733  protected function ‪markBrokenLinks(string $content): string
734  {
735  $blocks = $this->‪splitIntoBlock('A', $content);
736  $linkService = GeneralUtility::makeInstance(LinkService::class);
737  foreach ($blocks as $position => $value) {
738  if ($position % 2 === 0) {
739  continue;
740  }
741  [$attributes] = $this->‪get_tag_attributes($this->‪getFirstTag($value), true);
742  if (empty($attributes['href'])) {
743  continue;
744  }
745 
746  try {
747  $hrefInformation = $linkService->resolve($attributes['href']);
748 
749  $brokenLinkAnalysis = new ‪BrokenLinkAnalysisEvent($hrefInformation['type'], $hrefInformation);
750  $this->eventDispatcher->dispatch($brokenLinkAnalysis);
751  if ($brokenLinkAnalysis->isBrokenLink()) {
752  $attributes['data-rte-error'] = $brokenLinkAnalysis->getReason();
753  }
755  // do nothing if user doesn't have access to the file/folder
756  } catch (‪UnknownLinkHandlerException $e) {
757  $attributes['data-rte-error'] = $e->getMessage();
758  }
759 
760  // Always rewrite the block to allow the nested calling even if a page is found
761  $blocks[$position] =
762  '<a ' . GeneralUtility::implodeAttributes($attributes, true, true) . '>'
763  . $this->‪markBrokenLinks($this->‪removeFirstAndLastTag($blocks[$position]))
764  . '</a>';
765  }
766  return implode('', $blocks);
767  }
768 
776  protected function ‪removeBrokenLinkMarkers(string $content): string
777  {
778  $blocks = $this->‪splitIntoBlock('A', $content);
779  foreach ($blocks as $position => $value) {
780  if ($position % 2 === 0) {
781  continue;
782  }
783  [$attributes] = $this->‪get_tag_attributes($this->‪getFirstTag($value), true);
784  if (empty($attributes['href'])) {
785  continue;
786  }
787  // Always remove the styling again (regardless of the page was found or not)
788  // so the database does not contain ugly stuff
789  unset($attributes['data-rte-error']);
790  if (isset($attributes['style'])) {
791  $attributes['style'] = trim(str_replace('background-color: yellow; border:2px red solid; color: black;', '', $attributes['style']));
792  if (empty($attributes['style'])) {
793  unset($attributes['style']);
794  }
795  }
796  $blocks[$position] =
797  '<a ' . GeneralUtility::implodeAttributes($attributes, true, true) . '>'
798  . $this->‪removeBrokenLinkMarkers($this->‪removeFirstAndLastTag($blocks[$position]))
799  . '</a>';
800  }
801  return implode('', $blocks);
802  }
803 
804  protected function ‪htmlSanitize(string $content, array $configuration): string
805  {
806  $features = GeneralUtility::makeInstance(Features::class);
807  // either `htmlSanitize = null` or `htmlSanitize = false`
808  // or feature flag `security.backend.htmlSanitizeRte` is disabled
809  if (array_key_exists('htmlSanitize', $configuration) && empty($configuration['htmlSanitize'])
810  || !$features->isFeatureEnabled('security.backend.htmlSanitizeRte')
811  ) {
812  return $content;
813  }
814 
815  $build = $configuration['htmlSanitize.']['build'] ?? 'default';
816  if (class_exists($build) && is_a($build, BuilderInterface::class, true)) {
817  $builder = GeneralUtility::makeInstance($build);
818  } else {
819  $factory = GeneralUtility::makeInstance(SanitizerBuilderFactory::class);
820  $builder = $factory->build($build);
821  }
822  $sanitizer = $builder->build();
823  $initiator = GeneralUtility::makeInstance(SanitizerInitiator::class, static::class);
824  return $sanitizer->sanitize($content, $initiator);
825  }
826 }
‪TYPO3\CMS\Core\Html
Definition: DefaultSanitizerBuilder.php:18
‪TYPO3\CMS\Core\Html\HtmlParser\HTMLparserConfig
‪array HTMLparserConfig($TSconfig, $keepTags=[])
Definition: HtmlParser.php:861
‪TYPO3\CMS\Core\Html\RteHtmlParser\removeBrokenLinkMarkers
‪string removeBrokenLinkMarkers(string $content)
Definition: RteHtmlParser.php:776
‪TYPO3\CMS\Core\Html\HtmlParser\getFirstTagName
‪string getFirstTagName($str, $preserveCase=false)
Definition: HtmlParser.php:241
‪TYPO3\CMS\Core\Html\RteHtmlParser\runHtmlParserIfConfigured
‪string runHtmlParserIfConfigured(string $content, string $configurationDirective)
Definition: RteHtmlParser.php:257
‪TYPO3\CMS\Core\Resource\Exception\InsufficientFolderAccessPermissionsException
Definition: InsufficientFolderAccessPermissionsException.php:23
‪TYPO3\CMS\Core\Html\HtmlParser
Definition: HtmlParser.php:26
‪TYPO3\CMS\Core\Html\RteHtmlParser\TS_transform_rte
‪string TS_transform_rte(string $value)
Definition: RteHtmlParser.php:395
‪TYPO3\CMS\Core\Html\RteHtmlParser\$allowedTagsOutsideOfParagraphs
‪array $allowedTagsOutsideOfParagraphs
Definition: RteHtmlParser.php:95
‪TYPO3\CMS\Core\Html\RteHtmlParser\transformTextForRichTextEditor
‪transformTextForRichTextEditor(string $value, array $processingConfiguration)
Definition: RteHtmlParser.php:152
‪TYPO3\CMS\Core\Html\RteHtmlParser\sanitizeLineBreaksForContentOnly
‪string sanitizeLineBreaksForContentOnly(string $content)
Definition: RteHtmlParser.php:685
‪TYPO3\CMS\Core\Html\RteHtmlParser\__construct
‪__construct(protected readonly EventDispatcherInterface $eventDispatcher)
Definition: RteHtmlParser.php:110
‪TYPO3\CMS\Core\Html\RteHtmlParser\htmlSanitize
‪htmlSanitize(string $content, array $configuration)
Definition: RteHtmlParser.php:804
‪TYPO3\CMS\Core\Html\RteHtmlParser\$TS_transform_db_safecounter
‪int $TS_transform_db_safecounter
Definition: RteHtmlParser.php:61
‪TYPO3\CMS\Core\Html\HtmlParser\getFirstTag
‪string getFirstTag($str)
Definition: HtmlParser.php:218
‪TYPO3\CMS\Core\Html\RteHtmlParser\transformTextForPersistence
‪transformTextForPersistence(string $value, array $processingConfiguration)
Definition: RteHtmlParser.php:182
‪TYPO3\CMS\Core\Html\RteHtmlParser\$getKeepTags_cache
‪array $getKeepTags_cache
Definition: RteHtmlParser.php:66
‪TYPO3\CMS\Core\Html\RteHtmlParser\processContentWithinParagraph
‪string processContentWithinParagraph(string $content, string $fullContentWithTag)
Definition: RteHtmlParser.php:648
‪TYPO3\CMS\Core\Html\HtmlParser\get_tag_attributes
‪array get_tag_attributes($tag, $deHSC=false)
Definition: HtmlParser.php:267
‪TYPO3\CMS\Core\Html\RteHtmlParser\HTMLcleaner_db
‪string HTMLcleaner_db(string $content)
Definition: RteHtmlParser.php:464
‪TYPO3\CMS\Core\Html\RteHtmlParser\TS_transform_db
‪string TS_transform_db(string $value)
Definition: RteHtmlParser.php:317
‪TYPO3\CMS\Core\Html\RteHtmlParser\resolveAppliedTransformationModes
‪array resolveAppliedTransformationModes(string $direction)
Definition: RteHtmlParser.php:223
‪TYPO3\CMS\Core\Html\RteHtmlParser\streamlineLineBreaksAfterProcessing
‪string streamlineLineBreaksAfterProcessing(string $content)
Definition: RteHtmlParser.php:718
‪TYPO3\CMS\Core\Html\HtmlParser\compileTagAttribs
‪string compileTagAttribs($tagAttrib, $meta=[])
Definition: HtmlParser.php:839
‪TYPO3\CMS\Core\Configuration\Features
Definition: Features.php:56
‪TYPO3\CMS\Core\Html\RteHtmlParser
Definition: RteHtmlParser.php:40
‪TYPO3\CMS\Core\Html\HtmlParser\HTMLcleaner
‪string HTMLcleaner($content, $tags=[], $keepAll=0, $hSC=0, $addConfig=[])
Definition: HtmlParser.php:385
‪TYPO3\CMS\Core\Html\HtmlParser\removeFirstAndLastTag
‪string removeFirstAndLastTag($str)
Definition: HtmlParser.php:195
‪TYPO3\CMS\Core\Html\RteHtmlParser\markBrokenLinks
‪string markBrokenLinks(string $content)
Definition: RteHtmlParser.php:733
‪TYPO3\CMS\Core\Html\RteHtmlParser\$blockElementList
‪string $blockElementList
Definition: RteHtmlParser.php:46
‪TYPO3\CMS\Core\Html\HtmlParser\splitIntoBlock
‪array splitIntoBlock($tag, $content, $eliminateExtraEndTags=false)
Definition: HtmlParser.php:47
‪TYPO3\CMS\Core\Html\RteHtmlParser\streamlineLineBreaksForProcessing
‪string streamlineLineBreaksForProcessing(string $content)
Definition: RteHtmlParser.php:703
‪TYPO3\CMS\Core\Html\RteHtmlParser\setDivTags
‪string setDivTags(string $value)
Definition: RteHtmlParser.php:602
‪TYPO3\CMS\Core\Html\RteHtmlParser\divideIntoLines
‪string array divideIntoLines(string $value, int $count=5, bool $returnArray=false)
Definition: RteHtmlParser.php:549
‪TYPO3\CMS\Core\Utility\GeneralUtility\inList
‪static bool inList($list, $item)
Definition: GeneralUtility.php:422
‪TYPO3\CMS\Core\Html\RteHtmlParser\setProcessingConfiguration
‪setProcessingConfiguration(array $processingConfiguration)
Definition: RteHtmlParser.php:118
‪TYPO3\CMS\Core\Html\RteHtmlParser\TS_links_db
‪string TS_links_db(string $value)
Definition: RteHtmlParser.php:282
‪TYPO3\CMS\Core\Html\RteHtmlParser\$allowedAttributesForParagraphTags
‪array $allowedAttributesForParagraphTags
Definition: RteHtmlParser.php:77
‪TYPO3\CMS\Core\Html\RteHtmlParser\$defaultAllowedTagsList
‪string $defaultAllowedTagsList
Definition: RteHtmlParser.php:51
‪TYPO3\CMS\Core\Html\RteHtmlParser\$allowedClasses
‪array $allowedClasses
Definition: RteHtmlParser.php:71
‪TYPO3\CMS\Core\Html\RteHtmlParser\$procOptions
‪array $procOptions
Definition: RteHtmlParser.php:56
‪TYPO3\CMS\Core\Utility\GeneralUtility
Definition: GeneralUtility.php:52
‪TYPO3\CMS\Core\Html\RteHtmlParser\getKeepTags
‪array getKeepTags(string $direction)
Definition: RteHtmlParser.php:478
‪TYPO3\CMS\Core\Utility\GeneralUtility\trimExplode
‪static list< string > trimExplode(string $delim, string $string, bool $removeEmptyValues=false, int $limit=0)
Definition: GeneralUtility.php:822