‪TYPO3CMS  9.5
Punycode.php
Go to the documentation of this file.
1 <?php
2 
3 // {{{ license
4 
5 /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 foldmethod=marker: */
6 //
7 // +----------------------------------------------------------------------+
8 // | This library is free software; you can redistribute it and/or modify |
9 // | it under the terms of the GNU Lesser General Public License as |
10 // | published by the Free Software Foundation; either version 2.1 of the |
11 // | License, or (at your option) any later version. |
12 // | |
13 // | This library is distributed in the hope that it will be useful, but |
14 // | WITHOUT ANY WARRANTY; without even the implied warranty of |
15 // | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
16 // | Lesser General Public License for more details. |
17 // | |
18 // | You should have received a copy of the GNU Lesser General Public |
19 // | License along with this library; if not, write to the Free Software |
20 // | Foundation, Inc., 51 Franklin St, Boston, MA 02110, United States |
21 // +----------------------------------------------------------------------+
22 //
23 // }}}
24 
25  /*
26  * @author Matthias Sommerfeld <mso@phlylabs.de>
27  * @copyright 2004-2016 phlyLabs Berlin, http://phlylabs.de
28  * @version 1.0.1 2016-01-24
29  */
30 
31 namespace ‪Mso\IdnaConvert;
32 
34 {
35  // Internal settings, do not touch!
36  const ‪punycodePrefix = 'xn--';
37  const ‪invalidUcs = 0x80000000;
38  const ‪maxUcs = 0x10FFFF;
39  const ‪base = 36;
40  const ‪tMin = 1;
41  const ‪tMax = 26;
42  const ‪skew = 38;
43  const ‪damp = 700;
44  const ‪initialBias = 72;
45  const ‪initialN = 0x80;
46  const ‪sBase = 0xAC00;
47  const ‪lBase = 0x1100;
48  const ‪vBase = 0x1161;
49  const ‪tBase = 0x11A7;
50  const ‪lCount = 19;
51  const ‪vCount = 21;
52  const ‪tCount = 28;
53  const ‪nCount = 588; // vCount * tCount
54  const ‪sCount = 11172; // lCount * tCount * vCount
55  const ‪sLast = self::sBase + self::lCount * self::vCount * ‪self::tCount;
56 
57  protected static ‪$isMbStringOverload = null;
58 
59  protected ‪$NamePrepData;
61 
70  {
71  // populate mbstring overloading cache if not set
72  if (self::$isMbStringOverload === null) {
73  self::$isMbStringOverload = (extension_loaded('mbstring') && (ini_get('mbstring.func_overload') & 0x02) === 0x02);
74  }
75 
78  }
79 
84  public function ‪getPunycodePrefix()
85  {
87  }
88 
/**
 * Check whether a string looks like a Punycode label.
 *
 * The string must start with the Punycode prefix and, once surrounding
 * whitespace is trimmed, must be longer than the prefix alone.
 *
 * @param string $encoded candidate string
 * @return bool true if both conditions hold, false otherwise
 */
public function validate($encoded)
{
    $prefix = self::punycodePrefix;

    // Short-circuit: the length test only runs when the prefix is present.
    return strpos($encoded, $prefix) === 0
        && strlen(trim($encoded)) > strlen($prefix);
}
106 
/**
 * Decode a Punycode string into UTF-8.
 *
 * The basic code points before the last hyphen are copied verbatim; the
 * remainder is read as generalized variable-length integers that give the
 * code point and insertion position of each non-basic character.
 *
 * @param string $encoded Punycode string including the prefix
 * @return string|false result of UnicodeTranscoder::ucs4array_utf8() on the
 *                      decoded code points, or false if the input fails
 *                      validate(), is truncated, or contains a character
 *                      that decodes to a digit outside 0 .. base-1
 */
public function decode($encoded)
{
    if (!$this->validate($encoded)) {
        return false;
    }

    $prefix_len = self::byteLength(self::punycodePrefix);
    $decoded = [];
    // Find last occurrence of the delimiter
    $delim_pos = strrpos($encoded, '-');
    if ($delim_pos > $prefix_len) {
        // Everything between the prefix and the delimiter is a basic code point
        for ($k = $prefix_len; $k < $delim_pos; ++$k) {
            $decoded[] = ord($encoded[$k]);
        }
    }
    $deco_len = count($decoded);
    $enco_len = self::byteLength($encoded);

    // Wandering through the strings; init
    $is_first = true;
    $bias = self::initialBias;
    $idx = 0;
    $char = self::initialN;

    for ($enco_idx = ($delim_pos) ? ($delim_pos + 1) : 0; $enco_idx < $enco_len; ++$deco_len) {
        // Read one variable-length integer into $idx
        for ($old_idx = $idx, $w = 1, $k = self::base; 1; $k += self::base) {
            if ($enco_idx >= $enco_len) {
                // Input ended in the middle of an integer
                return false;
            }
            $digit = $this->decodeDigit($encoded[$enco_idx++]);
            if ($digit < 0 || $digit >= self::base) {
                // Not a valid base-36 digit
                return false;
            }
            $idx += $digit * $w;
            $t = ($k <= $bias) ? self::tMin :
                (($k >= $bias + self::tMax) ? self::tMax : ($k - $bias));
            if ($digit < $t) {
                break;
            }
            $w = (int)($w * (self::base - $t));
        }
        $bias = $this->adapt($idx - $old_idx, $deco_len + 1, $is_first);
        $is_first = false;
        // $idx encodes both the code point increment and the insert position
        $char += (int)($idx / ($deco_len + 1));
        $idx %= ($deco_len + 1);
        if ($deco_len > 0) {
            // Make room for the decoded char
            for ($i = $deco_len; $i > $idx; $i--) {
                $decoded[$i] = $decoded[($i - 1)];
            }
        }
        $decoded[$idx++] = $char;
    }
    return $this->UnicodeTranscoder->ucs4array_utf8($decoded);
}
160 
/**
 * Encode a list of Unicode code points as a Punycode string.
 *
 * @param array $decoded code points (integers), zero-indexed
 * @return string|false the encoded string (with prefix when non-basic code
 *                      points were encoded, without it when NAMEPREP left
 *                      only basic ones), or false if there is nothing to
 *                      encode or namePrep() yields an empty result
 * @throws \InvalidArgumentException if the input already starts with the
 *                      Punycode prefix, if namePrep() rejects a code point,
 *                      or if code points remain that can neither be copied
 *                      nor encoded
 */
public function encode($decoded)
{
    // We cannot encode a domain name containing the Punycode prefix
    $extract = self::byteLength(self::punycodePrefix);
    $check_pref = $this->UnicodeTranscoder->utf8_ucs4array(self::punycodePrefix);
    $check_deco = array_slice($decoded, 0, $extract);

    if ($check_pref == $check_deco) {
        throw new \InvalidArgumentException('This is already a Punycode string');
    }
    // We will not try to encode strings consisting of basic code points only
    $encodable = false;
    foreach ($decoded as $v) {
        if ($v > 0x7a) {
            $encodable = true;
            break;
        }
    }
    if (!$encodable) {
        return false;
    }
    // Do NAMEPREP
    $decoded = $this->namePrep($decoded);
    if (!$decoded || !is_array($decoded)) {
        return false; // NAMEPREP failed
    }
    $deco_len = count($decoded);
    if (!$deco_len) {
        return false; // Empty array
    }
    $codecount = 0; // How many chars have been consumed
    $encoded = '';
    // Copy all basic code points to output
    for ($i = 0; $i < $deco_len; ++$i) {
        $test = $decoded[$i];
        // Matches 0x30-0x3F ('0'-'9' and ':' to '?'), 'A'-'Z',
        // 0x61-0x7B ('a'-'z' and '{') and '-'
        if ((0x2F < $test && $test < 0x40)
            || (0x40 < $test && $test < 0x5B)
            || (0x60 < $test && $test <= 0x7B)
            || (0x2D == $test)) {
            $encoded .= chr($decoded[$i]);
            $codecount++;
        }
    }
    if ($codecount == $deco_len) {
        return $encoded; // All codepoints were basic ones
    }
    // Start with the prefix; copy it to output
    $encoded = self::punycodePrefix . $encoded;
    // If we have basic code points in output, add a hyphen to the end
    if ($codecount) {
        $encoded .= '-';
    }
    // Now find and encode all non-basic code points
    $is_first = true;
    $cur_code = self::initialN;
    $bias = self::initialBias;
    $delta = 0;
    while ($codecount < $deco_len) {
        // Find the smallest code point >= the current code point and
        // remember the last occurrence of it in the input
        $found = false;
        for ($i = 0, $next_code = self::maxUcs; $i < $deco_len; $i++) {
            if ($decoded[$i] >= $cur_code && $decoded[$i] <= $next_code) {
                $next_code = $decoded[$i];
                $found = true;
            }
        }
        if (!$found) {
            // Whatever is still unconsumed was neither copied as a basic
            // code point above nor lies in $cur_code .. maxUcs, so
            // $codecount could never reach $deco_len: stop instead of
            // looping forever.
            throw new \InvalidArgumentException('Input contains code points that cannot be Punycode encoded');
        }
        $delta += ($next_code - $cur_code) * ($codecount + 1);
        $cur_code = $next_code;

        // Scan input again and encode all characters whose code point is $cur_code
        for ($i = 0; $i < $deco_len; $i++) {
            if ($decoded[$i] < $cur_code) {
                $delta++;
            } elseif ($decoded[$i] == $cur_code) {
                // Emit $delta as a variable-length integer
                for ($q = $delta, $k = self::base; 1; $k += self::base) {
                    $t = ($k <= $bias)
                        ? self::tMin
                        : (($k >= $bias + self::tMax) ? self::tMax : $k - $bias);
                    if ($q < $t) {
                        break;
                    }

                    $encoded .= $this->encodeDigit(intval($t + (($q - $t) % (self::base - $t))));
                    $q = (int)(($q - $t) / (self::base - $t));
                }
                $encoded .= $this->encodeDigit($q);
                $bias = $this->adapt($delta, $codecount + 1, $is_first);
                $codecount++;
                $delta = 0;
                $is_first = false;
            }
        }
        $delta++;
        $cur_code++;
    }
    return $encoded;
}
263 
/**
 * Compute the next bias value from the delta that was just coded.
 *
 * @param int $delta delta of the code point just processed
 * @param int $npoints number of code points handled so far
 * @param bool $is_first true for the first adaptation, which scales the
 *                       delta by damp instead of 2
 * @return int the new bias
 */
protected function adapt($delta, $npoints, $is_first)
{
    $span = self::base - self::tMin;
    $limit = ($span * self::tMax) / 2;

    if ($is_first) {
        $delta = intval($delta / self::damp);
    } else {
        $delta = intval($delta / 2);
    }
    $delta += intval($delta / $npoints);

    // Scale delta down until it is within the limit, counting base steps
    $k = 0;
    while ($delta > $limit) {
        $delta = intval($delta / $span);
        $k += self::base;
    }

    return intval($k + ($span + 1) * $delta / ($delta + self::skew));
}
280 
/**
 * Turn a digit value into its character.
 *
 * Values below 26 are offset by 97 (so 0 gives 'a'); all other values are
 * offset by 22 (so 26 gives '0').
 *
 * @param int $d digit value
 * @return string single character
 */
protected function encodeDigit($d)
{
    if ($d < 26) {
        return chr($d + 97);
    }

    return chr($d + 22);
}
290 
/**
 * Turn a character into its digit value.
 *
 * '0'-'9' map to 26-35, and 'A'-'Z' as well as 'a'-'z' map to 0-25. Any
 * other byte yields self::base, which is outside the valid digit range.
 *
 * The original tests of the form `$cp - 48 < 10` only work with unsigned
 * arithmetic; with PHP's signed integers they also matched every byte
 * below the intended range and returned wrong (even negative) digits.
 * Explicit lower and upper bounds avoid that.
 *
 * @param string $cp single character
 * @return int digit value, or self::base if $cp is not a valid digit
 */
protected function decodeDigit($cp)
{
    $cp = ord($cp);

    if ($cp >= 48 && $cp <= 57) {
        // '0'..'9'
        return $cp - 22;
    }
    if ($cp >= 65 && $cp <= 90) {
        // 'A'..'Z'
        return $cp - 65;
    }
    if ($cp >= 97 && $cp <= 122) {
        // 'a'..'z'
        return $cp - 97;
    }

    return self::base;
}
312 
/**
 * Run the NAMEPREP steps on a list of code points.
 *
 * Each input code point is dropped, rejected, decomposed (Hangul range),
 * replaced via the replacement map, or copied. Afterwards sequences that
 * end in a combining character are offered to combine() and replaced by
 * the composed code point on a match.
 *
 * @param array $input code points (integers)
 * @return array the prepared code points
 * @throws \InvalidArgumentException if a prohibited code point is found
 */
protected function namePrep($input)
{
    $output = [];
    //
    // Mapping
    // Walking through the input array, performing the required steps on each of
    // the input chars and putting the result into the output array
    // While mapping required chars we apply the canonical ordering
    foreach ($input as $v) {
        // Map to nothing == skip that code point
        if (in_array($v, $this->NamePrepData->mapToNothing)) {
            continue;
        }
        // Try to find prohibited input
        if (in_array($v, $this->NamePrepData->prohibit) || in_array($v, $this->NamePrepData->generalProhibited)) {
            throw new \InvalidArgumentException(sprintf('NAMEPREP: Prohibited input U+%08X', $v));
        }
        foreach ($this->NamePrepData->prohibitRanges as $range) {
            if ($range[0] <= $v && $v <= $range[1]) {
                throw new \InvalidArgumentException(sprintf('NAMEPREP: Prohibited input U+%08X', $v));
            }
        }

        if (0xAC00 <= $v && $v <= 0xD7AF) {
            // Hangul syllable decomposition
            foreach ($this->hangulDecompose($v) as $out) {
                $output[] = (int)$out;
            }
        } elseif (isset($this->NamePrepData->replaceMaps[$v])) {
            foreach ($this->applyCanonicalOrdering($this->NamePrepData->replaceMaps[$v]) as $out) {
                $output[] = (int)$out;
            }
        } else {
            $output[] = (int)$v;
        }
    }
    // Before applying any Combining, try to rearrange any Hangul syllables
    // NOTE(review): the listing this block was taken from has a gap at this
    // point; presumably a hangulCompose() call on $output belongs here.
    // Confirm against the upstream file before relying on this block.
    //
    // Combine code points
    //
    $last_class = 0;
    $last_starter = 0;
    $out_len = count($output);
    for ($i = 0; $i < $out_len; ++$i) {
        $class = $this->getCombiningClass($output[$i]);
        if ((!$last_class || $last_class > $class) && $class) {
            // Try to match
            $seq_len = $i - $last_starter;
            $out = $this->combine(array_slice($output, $last_starter, $seq_len));
            // On match: Replace the last starter with the composed character and remove
            // the now redundant non-starter(s)
            if ($out) {
                $output[$last_starter] = $out;
                // combine() returns a single code point (a replaceMaps key),
                // so the composed length is always 1. The former count($out)
                // on that scalar raises a warning on PHP 7.2+ and a TypeError
                // on PHP 8.
                if (1 != $seq_len) {
                    for ($j = $i + 1; $j < $out_len; ++$j) {
                        $output[$j - 1] = $output[$j];
                    }
                    // NOTE(review): index $out_len is one past the last
                    // element, so this looks like a no-op that leaves the old
                    // tail element in the returned array - confirm intent.
                    unset($output[$out_len]);
                }
                // Rewind the for loop by one, since there can be more possible compositions
                $i--;
                $out_len--;
                $last_class = ($i == $last_starter) ? 0 : $this->getCombiningClass($output[$i - 1]);
                continue;
            }
        }
        // The current class is 0
        if (!$class) {
            $last_starter = $i;
        }
        $last_class = $class;
    }
    return $output;
}
393 
/**
 * Decompose a precomposed Hangul syllable into its conjoining jamo.
 *
 * @param int $char code point
 * @return array the leading and vowel jamo, plus the trailing jamo when the
 *               syllable has one; [$char] unchanged if $char is outside
 *               sBase .. sBase + sCount - 1
 */
protected function hangulDecompose($char)
{
    $sindex = (int)$char - self::sBase;
    if ($sindex < 0 || $sindex >= self::sCount) {
        return [$char];
    }

    // The cast has to cover the whole quotient. Casting only the base
    // constant (as before) left float division in place and produced
    // non-integer values in the result.
    $result = [];
    $result[] = self::lBase + (int)($sindex / self::nCount);
    $result[] = self::vBase + (int)(($sindex % self::nCount) / self::tCount);
    $T = self::tBase + $sindex % self::tCount;
    // tBase itself stands for "no trailing jamo"
    if ($T != self::tBase) {
        $result[] = $T;
    }
    return $result;
}
415 
/**
 * Compose conjoining Hangul jamo into precomposed syllables.
 *
 * Walks the input once, merging an L + V pair into an LV syllable and an
 * LV syllable + T into an LVT syllable; every other code point is copied.
 *
 * @param array $input code points (integers), zero-indexed
 * @return array code points with composable jamo sequences replaced
 */
protected function hangulCompose($input)
{
    $inp_len = count($input);
    if (!$inp_len) {
        return [];
    }
    $result = [];
    $last = (int)$input[0];
    $result[] = $last; // copy first char from input to output

    for ($i = 1; $i < $inp_len; ++$i) {
        $char = (int)$input[$i];
        $sindex = $last - self::sBase;
        $lindex = $last - self::lBase;
        $vindex = $char - self::vBase;
        $tindex = $char - self::tBase;
        // Find out, whether two current characters are LV and T.
        // tBase is one below the first trailing jamo, so valid trailing
        // indexes are 1 .. tCount - 1. The former inclusive bounds also
        // swallowed tBase itself and mis-composed tBase + tCount.
        if (0 <= $sindex && $sindex < self::sCount && ($sindex % self::tCount == 0)
            && 0 < $tindex && $tindex < self::tCount) {
            // create syllable of form LVT
            $last += $tindex;
            $result[(count($result) - 1)] = $last; // reset last
            continue; // discard char
        }
        // Find out, whether two current characters form L and V
        if (0 <= $lindex && $lindex < self::lCount && 0 <= $vindex && $vindex < self::vCount) {
            // create syllable of form LV
            $last = self::sBase + ($lindex * self::vCount + $vindex) * self::tCount;
            $result[(count($result) - 1)] = $last; // reset last
            continue; // discard char
        }
        // if neither case was true, just add the character
        $last = $char;
        $result[] = $char;
    }
    return $result;
}
458 
/**
 * Look up the combining class of a code point.
 *
 * @param int $char code point
 * @return int the entry from NamePrepData's normalizeCombiningClasses
 *             table, or 0 when the table has no entry for $char
 */
protected function getCombiningClass($char)
{
    if (isset($this->NamePrepData->normalizeCombiningClasses[$char])) {
        return $this->NamePrepData->normalizeCombiningClasses[$char];
    }

    return 0;
}
470 
/**
 * Reorder a sequence so that code points with a non-zero combining class
 * do not follow one with a higher combining class.
 *
 * @param array $input code points, zero-indexed
 * @return array the reordered sequence; elements that were moved are cast
 *               to int
 */
protected function applyCanonicalOrdering($input)
{
    $size = count($input);
    if ($size < 2) {
        // Nothing to reorder. This also avoids reading $input[0] on an
        // empty array.
        return $input;
    }

    $swap = true;
    while ($swap) {
        $swap = false;
        $last = $this->getCombiningClass(intval($input[0]));
        for ($i = 0; $i < $size - 1; ++$i) {
            $next = $this->getCombiningClass(intval($input[$i + 1]));
            if ($next != 0 && $last > $next) {
                // Move item leftward until it fits
                for ($j = $i + 1; $j > 0; --$j) {
                    if ($this->getCombiningClass(intval($input[$j - 1])) <= $next) {
                        break;
                    }
                    $t = intval($input[$j]);
                    $input[$j] = intval($input[$j - 1]);
                    $input[$j - 1] = $t;
                    $swap = true;
                }
                // Reentering the loop looking at the old character again
                $next = $last;
            }
            $last = $next;
        }
    }
    return $input;
}
504 
/**
 * Find the code point whose replacement sequence equals the given sequence.
 *
 * Scans NamePrepData's replaceMaps for the first entry whose value has the
 * same length as $input and compares equal (loosely) element by element.
 *
 * @param array $input sequence of code points
 * @return int|string|false the key of the matching replaceMaps entry, or
 *                          false if $input is empty or nothing matches
 */
protected function combine($input)
{
    $length = count($input);
    if ($length == 0) {
        return false;
    }

    foreach ($this->NamePrepData->replaceMaps as $source => $sequence) {
        // Cheap rejections first: first element, then length
        if ($sequence[0] != $input[0] || count($sequence) != $length) {
            continue;
        }

        $matches = true;
        foreach ($input as $pos => $codePoint) {
            if ($codePoint != $sequence[$pos]) {
                $matches = false;
                break;
            }
        }
        if ($matches) {
            return $source;
        }
    }

    return false;
}
538 
/**
 * Return the length of a string in bytes.
 *
 * When the cached flag says mbstring overloads the string functions,
 * mb_strlen() with the '8bit' encoding is used; otherwise plain strlen().
 *
 * @param string $string
 * @return int number of bytes
 */
protected static function byteLength($string)
{
    if (!self::$isMbStringOverload) {
        return strlen((string)$string);
    }

    return mb_strlen($string, '8bit');
}
553 }
‪Mso\IdnaConvert\Punycode\__construct
‪__construct(NamePrepDataInterface $NamePrepData, UnicodeTranscoderInterface $UnicodeTranscoder)
Definition: Punycode.php:69
‪Mso\IdnaConvert\Punycode\byteLength
‪static int byteLength($string)
Definition: Punycode.php:546
‪Mso\IdnaConvert\Punycode\namePrep
‪string namePrep($input)
Definition: Punycode.php:318
‪Mso\IdnaConvert\Punycode\$NamePrepData
‪$NamePrepData
Definition: Punycode.php:59
‪Mso\IdnaConvert\Punycode\encodeDigit
‪string encodeDigit($d)
Definition: Punycode.php:286
‪Mso\IdnaConvert\Punycode\tBase
‪const tBase
Definition: Punycode.php:49
‪Mso\IdnaConvert\Punycode\tCount
‪const tCount
Definition: Punycode.php:52
‪Mso\IdnaConvert\Punycode\vCount
‪const vCount
Definition: Punycode.php:51
‪Mso\IdnaConvert\Punycode\nCount
‪const nCount
Definition: Punycode.php:53
‪Mso\IdnaConvert\Punycode\lBase
‪const lBase
Definition: Punycode.php:47
‪Mso\IdnaConvert\NamePrepDataInterface
Definition: NamePrepDataInterface.php:6
‪Mso\IdnaConvert\UnicodeTranscoder\utf8_ucs4array
‪static array utf8_ucs4array($input)
Definition: UnicodeTranscoder.php:73
‪Mso\IdnaConvert\Punycode\initialN
‪const initialN
Definition: Punycode.php:45
‪Mso\IdnaConvert\Punycode\damp
‪const damp
Definition: Punycode.php:43
‪Mso\IdnaConvert\Punycode\getPunycodePrefix
‪string getPunycodePrefix()
Definition: Punycode.php:84
‪Mso\IdnaConvert\Punycode\combine
‪array combine($input)
Definition: Punycode.php:510
‪Mso\IdnaConvert\UnicodeTranscoder\ucs4array_utf8
‪static string ucs4array_utf8($input)
Definition: UnicodeTranscoder.php:166
‪Mso\IdnaConvert\Punycode\sCount
‪const sCount
Definition: Punycode.php:54
‪Mso\IdnaConvert\Punycode\vBase
‪const vBase
Definition: Punycode.php:48
‪Mso\IdnaConvert\Punycode\getCombiningClass
‪int getCombiningClass($char)
Definition: Punycode.php:464
‪Mso\IdnaConvert\Punycode\adapt
‪int adapt($delta, $npoints, $is_first)
Definition: Punycode.php:271
‪Mso\IdnaConvert\Punycode\$UnicodeTranscoder
‪$UnicodeTranscoder
Definition: Punycode.php:60
‪Mso\IdnaConvert\Punycode\maxUcs
‪const maxUcs
Definition: Punycode.php:38
‪Mso\IdnaConvert\Punycode\tMin
‪const tMin
Definition: Punycode.php:40
‪Mso\IdnaConvert\Punycode\tMax
‪const tMax
Definition: Punycode.php:41
‪Mso\IdnaConvert\Punycode\applyCanonicalOrdering
‪array applyCanonicalOrdering($input)
Definition: Punycode.php:476
‪Mso\IdnaConvert\Punycode\hangulCompose
‪array hangulCompose($input)
Definition: Punycode.php:422
‪Mso\IdnaConvert\Punycode\hangulDecompose
‪array hangulDecompose($char)
Definition: Punycode.php:400
‪Mso\IdnaConvert
Definition: EncodingHelper.php:8
‪Mso\IdnaConvert\Punycode\encode
‪mixed encode($decoded)
Definition: Punycode.php:166
‪Mso\IdnaConvert\Punycode\lCount
‪const lCount
Definition: Punycode.php:50
‪Mso\IdnaConvert\Punycode\$isMbStringOverload
‪static $isMbStringOverload
Definition: Punycode.php:57
‪$output
‪$output
Definition: annotationChecker.php:113
‪Mso\IdnaConvert\Punycode\sLast
‪const sLast
Definition: Punycode.php:55
‪Mso\IdnaConvert\Punycode\decode
‪mixed decode($encoded)
Definition: Punycode.php:112
‪Mso\IdnaConvert\Punycode\punycodePrefix
‪const punycodePrefix
Definition: Punycode.php:36
‪Mso\IdnaConvert\Punycode\skew
‪const skew
Definition: Punycode.php:42
‪Mso\IdnaConvert\Punycode\base
‪const base
Definition: Punycode.php:39
‪Mso\IdnaConvert\Punycode\validate
‪bool validate($encoded)
Definition: Punycode.php:94
‪Mso\IdnaConvert\Punycode
Definition: Punycode.php:34
‪Mso\IdnaConvert\NamePrepData
Definition: NamePrepData.php:6
‪Mso\IdnaConvert\Punycode\sBase
‪const sBase
Definition: Punycode.php:46
‪Mso\IdnaConvert\Punycode\initialBias
‪const initialBias
Definition: Punycode.php:44
‪Mso\IdnaConvert\Punycode\invalidUcs
‪const invalidUcs
Definition: Punycode.php:37
‪Mso\IdnaConvert\PunycodeInterface
Definition: PunycodeInterface.php:10
‪Mso\IdnaConvert\Punycode\decodeDigit
‪int decodeDigit($cp)
Definition: Punycode.php:296
‪Mso\IdnaConvert\UnicodeTranscoderInterface
Definition: UnicodeTranscoderInterface.php:20
‪Mso\IdnaConvert\UnicodeTranscoder
Definition: UnicodeTranscoder.php:20