‪TYPO3CMS  ‪main
HtmlCropper.php
Go to the documentation of this file.
1 <?php
2 
3 declare(strict_types=1);
4 
5 /*
6  * This file is part of the TYPO3 CMS project.
7  *
8  * It is free software; you can redistribute it and/or modify it under
9  * the terms of the GNU General Public License, either version 2
10  * of the License, or any later version.
11  *
12  * For the full copyright and license information, please read the
13  * LICENSE.txt file that was distributed with this source code.
14  *
15  * The TYPO3 project - inspiring people to share!
16  */
17 
18 namespace ‪TYPO3\CMS\Core\Html;
19 
20 use Psr\Log\LoggerAwareInterface;
21 use Psr\Log\LoggerAwareTrait;
22 
class HtmlCropper implements LoggerAwareInterface
{
    use LoggerAwareTrait;

    /**
     * Crops HTML content to a number of visible characters while keeping the markup balanced.
     *
     * The content is split into text pieces and tag blocks. Only the text pieces are counted
     * (an HTML entity counts as one character), tag blocks are passed through untouched. Once
     * the limit is exceeded, the text piece is cut, everything behind it is dropped, the
     * ellipsis replacement is appended and the tags that were still open at the crop position
     * are closed again.
     *
     * @param string $content The HTML string to crop
     * @param int $numberOfChars Maximum number of text characters to keep; a negative value
     *                           keeps that many characters counted from the end of the content
     * @param string $replacementForEllipsis String (trimmed before use) inserted at the crop
     *                                       position; only added if something was cropped
     * @param bool $cropToSpace If true, the cut is moved to the nearest space inside the kept
     *                          text so that no word is split; no effect if there is no space
     * @return string The cropped HTML. An empty string if the content could not be split.
     */
    public function crop(string $content, int $numberOfChars, string $replacementForEllipsis, bool $cropToSpace): string
    {
        $cropFromRight = $numberOfChars < 0;
        $absChars = abs($numberOfChars);
        $replacementForEllipsis = trim($replacementForEllipsis);
        // Split $content into an array (even items in the array are outside the tags, odd numbers are tag-blocks).
        $tags = 'a|abbr|address|area|article|aside|audio|b|bdi|bdo|blockquote|body|br|button|caption|cite|code|col|colgroup|data|datalist|dd|del|dfn|div|dl|dt|em|embed|fieldset|figcaption|figure|font|footer|form|h1|h2|h3|h4|h5|h6|header|hr|i|iframe|img|input|ins|kbd|keygen|label|legend|li|link|main|map|mark|meter|nav|object|ol|optgroup|option|output|p|param|pre|progress|q|rb|rp|rt|rtc|ruby|s|samp|section|select|small|source|span|strong|sub|sup|table|tbody|td|textarea|tfoot|th|thead|time|tr|track|u|ul|ut|var|video|wbr';
        // The pattern is compiled with the "x" modifier, so whitespace and "# ..." comments inside it are ignored.
        $tagsRegEx = '
            (
                (?:
                    <!--.*?--> # a comment
                    |
                    <canvas[^>]*>.*?</canvas> # a canvas tag
                    |
                    <script[^>]*>.*?</script> # a script tag
                    |
                    <noscript[^>]*>.*?</noscript> # a noscript tag
                    |
                    <template[^>]*>.*?</template> # a template tag
                )
                |
                </?(?:' . $tags . ')+ # opening tag (\'<tag\') or closing tag (\'</tag\')
                (?:
                    (?:
                        (?:
                            \\s+\\w[\\w-]* # EITHER spaces, followed by attribute names
                            (?:
                                \\s*=?\\s* # equals
                                (?>
                                    ".*?" # attribute values in double-quotes
                                    |
                                    \'.*?\' # attribute values in single-quotes
                                    |
                                    [^\'">\\s]+ # plain attribute values
                                )
                            )?
                        )
                        | # OR a single dash (for TYPO3 link tag)
                        (?:
                            \\s+-
                        )
                    )+\\s*
                    | # OR only spaces
                    \\s*
                )
                /?> # closing the tag with \'>\' or \'/>\'
            )';
        $splittedContent = preg_split('%' . $tagsRegEx . '%xs', $content, -1, PREG_SPLIT_DELIM_CAPTURE);
        if ($splittedContent === false) {
            // The logger is optional (LoggerAwareTrait leaves it null until one is injected).
            if ($this->logger !== null) {
                $this->logger->debug('Unable to split "{content}" into tags.', ['content' => $content]);
            }
            $splittedContent = [];
        }

        // Reverse array if we are cropping from right, so the loop below always walks "forward".
        if ($cropFromRight) {
            $splittedContent = array_reverse($splittedContent);
        }
        // Crop the text (chars of tag-blocks are not counted).
        $strLen = 0;
        // This is the offset of the content item which was cropped.
        $croppedOffset = null;
        $countSplittedContent = count($splittedContent);
        // Only even offsets hold text, odd offsets hold tag-blocks.
        for ($offset = 0; $offset < $countSplittedContent; $offset += 2) {
            $fullTempContent = $splittedContent[$offset];
            $thisStrLen = mb_strlen(html_entity_decode($fullTempContent, ENT_COMPAT, 'UTF-8'), 'utf-8');
            if ($strLen + $thisStrLen <= $absChars) {
                // This text piece still fits completely.
                $strLen += $thisStrLen;
                continue;
            }

            $croppedOffset = $offset;
            // Number of visible characters that may still be taken from this text piece.
            $cropPosition = $absChars - $strLen;

            // HTML entities have to be counted as 1 character. Strategy: replace every entity
            // with a placeholder ("$"), cut the placeholder string at the visible position and
            // translate the result back into a position within the raw string.
            // The snippet "&[^&\s;]{2,8};" in the RegEx below represents entities.
            $entityPattern = '/&[^&\\s;]{2,8};/';
            // Literal "$" characters are neutralised first. Entities are collected from this
            // escaped string, so the entity list and the placeholders always line up.
            $escapedContent = str_replace('$', ' ', $fullTempContent);
            preg_match_all($entityPattern, $escapedContent, $matches);
            $entityMatches = $matches[0];
            if ($entityMatches !== []) {
                $replacedContent = (string)preg_replace($entityPattern, '$', $escapedContent);
                // Take the visible characters that are kept: the first ones when cropping from
                // the left, the LAST ones when cropping from the right.
                $croppedContent = $cropFromRight
                    ? mb_substr($replacedContent, -$cropPosition, $cropPosition)
                    : mb_substr($replacedContent, 0, $cropPosition);
                $placeholderCount = substr_count($croppedContent, '$');
                // When cropping from the right, the kept placeholders belong to the last entities.
                if ($cropFromRight) {
                    $entityMatches = array_reverse($entityMatches);
                }
                $entityChars = 0;
                foreach ($entityMatches as $index => $entity) {
                    if ($index >= $placeholderCount) {
                        break;
                    }
                    $entityChars += mb_strlen($entity);
                }
                // Raw length = kept characters, with each placeholder swapped for its entity.
                // Computed arithmetically; no byte-wise string reversal is involved, so
                // multi-byte characters are counted correctly.
                $cropPosition = mb_strlen($croppedContent) - $placeholderCount + $entityChars;
            }

            // Main cropping. Note the +1 and -1. One extra character is kept to be able to
            // check for space characters later on.
            $fullTempContent = $cropFromRight
                ? mb_substr($fullTempContent, -$cropPosition - 1)
                : mb_substr($fullTempContent, 0, $cropPosition + 1);

            // Crop to space means, we ensure to crop before (or after) a space.
            // If there are no spaces, this option has no effect.
            $cropToSpaceApplied = false;
            if ($cropToSpace) {
                $words = explode(' ', $fullTempContent);
                if (count($words) > 1) {
                    // Drop the (possibly partial) word at the cut side. If the extra character
                    // was a space, this is an empty string and only the space is removed.
                    if ($cropFromRight) {
                        array_shift($words);
                    } else {
                        array_pop($words);
                    }
                    $cropToSpaceApplied = true;
                }
                $fullTempContent = implode(' ', $words);
            }

            // Only remove the extra character again, if crop2space did not apply anything.
            if (!$cropToSpaceApplied) {
                $fullTempContent = $cropFromRight
                    ? mb_substr($fullTempContent, 1)
                    : mb_substr($fullTempContent, 0, -1);
            }

            $splittedContent[$offset] = $fullTempContent;
            break;
        }

        // Close cropped tags.
        $closingTags = [];
        if ($croppedOffset !== null) {
            $openingTagRegEx = '#^<(\\w+)(?:\\s|>)#';
            $closingTagRegEx = '#^</(\\w+)(?:\\s|>)#';
            // The array is reversed when cropping from the right, so the roles of opening and
            // closing tags are swapped in that case.
            $leadingTagRegEx = $cropFromRight ? $closingTagRegEx : $openingTagRegEx;
            $trailingTagRegEx = $cropFromRight ? $openingTagRegEx : $closingTagRegEx;
            $countSplittedContent = count($splittedContent);
            // Walk back over all tag-blocks in front of the cropped text piece.
            for ($offset = $croppedOffset - 1; $offset >= 0; $offset -= 2) {
                if (substr($splittedContent[$offset], -2) === '/>') {
                    // Ignore empty element tags (e.g. <br />).
                    continue;
                }
                preg_match($leadingTagRegEx, $splittedContent[$offset], $matches);
                $tagName = $matches[1] ?? null;
                if ($tagName === null) {
                    continue;
                }
                // Seek for the closing (or opening) tag.
                for ($seekingOffset = $offset + 2; $seekingOffset < $countSplittedContent; $seekingOffset += 2) {
                    preg_match($trailingTagRegEx, $splittedContent[$seekingOffset], $matches);
                    $seekingTagName = $matches[1] ?? null;
                    if ($tagName !== $seekingTagName) {
                        continue;
                    }
                    // We found a matching tag.
                    // Add closing tag only if it occurs after the cropped content item.
                    if ($seekingOffset > $croppedOffset) {
                        $closingTags[] = $splittedContent[$seekingOffset];
                    }
                    break;
                }
            }
            // Drop the cropped items of the content array. The $closingTags will be added later on again.
            array_splice($splittedContent, $croppedOffset + 1);
        }
        $splittedContent = array_merge(
            $splittedContent,
            [$croppedOffset !== null ? $replacementForEllipsis : ''],
            $closingTags
        );
        // Reverse array once again if we are cropping from the end.
        if ($cropFromRight) {
            $splittedContent = array_reverse($splittedContent);
        }
        return implode('', $splittedContent);
    }
}
‪TYPO3\CMS\Core\Html
Definition: DefaultSanitizerBuilder.php:18
‪TYPO3\CMS\Core\Html\HtmlCropper
Definition: HtmlCropper.php:24
‪TYPO3\CMS\Core\Html\HtmlCropper\crop
‪string crop(string $content, int $numberOfChars, string $replacementForEllipsis, bool $cropToSpace)
Definition: HtmlCropper.php:37