| 148 |
lars |
1 |
<?php
|
|
|
2 |
|
|
|
3 |
/*
|
|
|
4 |
* This file is part of the Symfony package.
|
|
|
5 |
*
|
|
|
6 |
* (c) Fabien Potencier <fabien@symfony.com>
|
|
|
7 |
*
|
|
|
8 |
* For the full copyright and license information, please view the LICENSE
|
|
|
9 |
* file that was distributed with this source code.
|
|
|
10 |
*/
|
|
|
11 |
|
|
|
12 |
namespace Symfony\Component\String;
|
|
|
13 |
|
|
|
14 |
use Symfony\Component\String\Exception\ExceptionInterface;
|
|
|
15 |
use Symfony\Component\String\Exception\InvalidArgumentException;
|
|
|
16 |
use Symfony\Component\String\Exception\RuntimeException;
|
|
|
17 |
|
|
|
18 |
/**
|
|
|
19 |
* Represents a string of abstract Unicode characters.
|
|
|
20 |
*
|
|
|
21 |
* Unicode defines 3 types of "characters" (bytes, code points and grapheme clusters).
|
|
|
22 |
* This class is the abstract type to use as a type-hint when the logic you want to
|
|
|
23 |
* implement is Unicode-aware but doesn't care about code points vs grapheme clusters.
|
|
|
24 |
*
|
|
|
25 |
* @author Nicolas Grekas <p@tchwork.com>
|
|
|
26 |
*
|
|
|
27 |
* @throws ExceptionInterface
|
|
|
28 |
*/
|
|
|
29 |
abstract class AbstractUnicodeString extends AbstractString
|
|
|
30 |
{
|
|
|
31 |
public const NFC = \Normalizer::NFC;
|
|
|
32 |
public const NFD = \Normalizer::NFD;
|
|
|
33 |
public const NFKC = \Normalizer::NFKC;
|
|
|
34 |
public const NFKD = \Normalizer::NFKD;
|
|
|
35 |
|
|
|
36 |
// all ASCII characters (letters, digits, punctuation and control codes) sorted by typical frequency of occurrence
|
|
|
37 |
private const ASCII = "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F";
|
|
|
38 |
|
|
|
39 |
// the subset of folded case mappings that is not in lower case mappings
|
| 688 |
lars |
40 |
private const FOLD_FROM = ['İ', 'µ', 'Å¿', "\xCD\x85", 'Ï‚', 'Ï', 'Ï‘', 'Ï•', 'Ï–', 'ϰ', 'ϱ', 'ϵ', 'ẛ', "\xE1\xBE\xBE", 'ß', 'ʼn', 'ǰ', 'Î', 'ΰ', 'Ö‡', 'ẖ', 'ẗ', 'ẘ', 'ẙ', 'ẚ', 'ẞ', 'á½', 'á½’', 'á½”', 'á½–', 'á¾€', 'á¾', 'ᾂ', 'ᾃ', 'ᾄ', 'á¾…', 'ᾆ', 'ᾇ', 'ᾈ', 'ᾉ', 'ᾊ', 'ᾋ', 'ᾌ', 'á¾', 'ᾎ', 'á¾', 'á¾', 'ᾑ', 'á¾’', 'ᾓ', 'á¾”', 'ᾕ', 'á¾–', 'á¾—', 'ᾘ', 'á¾™', 'ᾚ', 'á¾›', 'ᾜ', 'á¾', 'ᾞ', 'ᾟ', 'á¾ ', 'ᾡ', 'á¾¢', 'á¾£', 'ᾤ', 'á¾¥', 'ᾦ', 'á¾§', 'ᾨ', 'ᾩ', 'ᾪ', 'ᾫ', 'ᾬ', 'á¾', 'á¾®', 'ᾯ', 'á¾²', 'á¾³', 'á¾´', 'á¾¶', 'á¾·', 'á¾¼', 'á¿‚', 'ῃ', 'á¿„', 'ῆ', 'ῇ', 'ῌ', 'á¿’', 'á¿–', 'á¿—', 'á¿¢', 'ῤ', 'ῦ', 'á¿§', 'ῲ', 'ῳ', 'á¿´', 'á¿¶', 'á¿·', 'ῼ', 'ff', 'ï¬', 'fl', 'ffi', 'ffl', 'ſt', 'st', 'ﬓ', 'ﬔ', 'ﬕ', 'ﬖ', 'ﬗ'];
|
|
|
41 |
private const FOLD_TO = ['i̇', 'μ', 's', 'ι', 'σ', 'β', 'θ', 'φ', 'Ï€', 'κ', 'Ï', 'ε', 'ṡ', 'ι', 'ss', 'ʼn', 'ǰ', 'Î', 'ΰ', 'Õ¥Ö‚', 'ẖ', 'ẗ', 'ẘ', 'ẙ', 'aʾ', 'ss', 'á½', 'á½’', 'á½”', 'á½–', 'ἀι', 'á¼Î¹', 'ἂι', 'ἃι', 'ἄι', 'ἅι', 'ἆι', 'ἇι', 'ἀι', 'á¼Î¹', 'ἂι', 'ἃι', 'ἄι', 'ἅι', 'ἆι', 'ἇι', 'ἠι', 'ἡι', 'ἢι', 'ἣι', 'ἤι', 'ἥι', 'ἦι', 'ἧι', 'ἠι', 'ἡι', 'ἢι', 'ἣι', 'ἤι', 'ἥι', 'ἦι', 'ἧι', 'ὠι', 'ὡι', 'ὢι', 'ὣι', 'ὤι', 'ὥι', 'ὦι', 'ὧι', 'ὠι', 'ὡι', 'ὢι', 'ὣι', 'ὤι', 'ὥι', 'ὦι', 'ὧι', 'ὰι', 'αι', 'άι', 'á¾¶', 'ᾶι', 'αι', 'ὴι', 'ηι', 'ήι', 'ῆ', 'ῆι', 'ηι', 'á¿’', 'á¿–', 'á¿—', 'á¿¢', 'ῤ', 'ῦ', 'á¿§', 'ὼι', 'ωι', 'ώι', 'á¿¶', 'ῶι', 'ωι', 'ff', 'fi', 'fl', 'ffi', 'ffl', 'st', 'st', 'Õ´Õ¶', 'Õ´Õ¥', 'Õ´Õ«', 'Õ¾Õ¶', 'Õ´Õ'];
|
| 148 |
lars |
42 |
|
|
|
43 |
// the subset of upper case mappings that map one code point to many code points
|
|
|
44 |
private const UPPER_FROM = ['ß', 'ff', 'ï¬', 'fl', 'ffi', 'ffl', 'ſt', 'st', 'Ö‡', 'ﬓ', 'ﬔ', 'ﬕ', 'ﬖ', 'ﬗ', 'ʼn', 'Î', 'ΰ', 'ǰ', 'ẖ', 'ẗ', 'ẘ', 'ẙ', 'ẚ', 'á½', 'á½’', 'á½”', 'á½–', 'á¾¶', 'ῆ', 'á¿’', 'á¿“', 'á¿–', 'á¿—', 'á¿¢', 'á¿£', 'ῤ', 'ῦ', 'á¿§', 'á¿¶'];
|
|
|
45 |
private const UPPER_TO = ['SS', 'FF', 'FI', 'FL', 'FFI', 'FFL', 'ST', 'ST', 'ÔµÕ’', 'Õ„Õ†', 'Õ„Ôµ', 'Õ„Ô»', 'ÕŽÕ†', 'Õ„Ô½', 'ʼN', 'ΪÌ', 'ΫÌ', 'JÌŒ', 'H̱', 'T̈', 'WÌŠ', 'YÌŠ', 'Aʾ', 'Υ̓', 'Υ̓̀', 'Υ̓Ì', 'Υ̓͂', 'Α͂', 'Η͂', 'Ϊ̀', 'ΪÌ', 'Ι͂', 'Ϊ͂', 'Ϋ̀', 'ΫÌ', 'Ρ̓', 'Υ͂', 'Ϋ͂', 'Ω͂'];
|
|
|
46 |
|
|
|
47 |
// the subset of https://github.com/unicode-org/cldr/blob/master/common/transforms/Latin-ASCII.xml that is not in NFKD
|
|
|
48 |
private const TRANSLIT_FROM = ['Æ', 'Ã', 'Ø', 'Þ', 'ß', 'æ', 'ð', 'ø', 'þ', 'Ä', 'Ä‘', 'Ħ', 'ħ', 'ı', 'ĸ', 'Ä¿', 'Å€', 'Å', 'Å‚', 'ʼn', 'ÅŠ', 'Å‹', 'Å’', 'Å“', 'Ŧ', 'ŧ', 'Æ€', 'Æ', 'Æ‚', 'ƃ', 'Ƈ', 'ƈ', 'Ɖ', 'ÆŠ', 'Æ‹', 'ÆŒ', 'Æ', 'Æ‘', 'Æ’', 'Æ“', 'Æ•', 'Æ–', 'Æ—', 'Ƙ', 'Æ™', 'Æš', 'Æ', 'Æž', 'Æ¢', 'Æ£', 'Ƥ', 'Æ¥', 'Æ«', 'Ƭ', 'Æ', 'Æ®', 'Ʋ', 'Ƴ', 'Æ´', 'Ƶ', 'ƶ', 'Ç„', 'Ç…', 'dž', 'Ǥ', 'Ç¥', 'È¡', 'Ȥ', 'È¥', 'È´', 'ȵ', 'ȶ', 'È·', 'ȸ', 'ȹ', 'Ⱥ', 'È»', 'ȼ', 'Ƚ', 'Ⱦ', 'È¿', 'É€', 'Ƀ', 'É„', 'Ɇ', 'ɇ', 'Ɉ', 'ɉ', 'ÉŒ', 'É', 'ÉŽ', 'É', 'É“', 'É•', 'É–', 'É—', 'É›', 'ÉŸ', 'É ', 'É¡', 'É¢', 'ɦ', 'ɧ', 'ɨ', 'ɪ', 'É«', 'ɬ', 'É', 'ɱ', 'ɲ', 'ɳ', 'É´', 'ɶ', 'ɼ', 'ɽ', 'ɾ', 'Ê€', 'Ê‚', 'ʈ', 'ʉ', 'Ê‹', 'Ê', 'Ê', 'Ê‘', 'Ê™', 'Ê›', 'Êœ', 'Ê', 'ÊŸ', 'Ê ', 'Ê£', 'Ê¥', 'ʦ', 'ʪ', 'Ê«', 'á´€', 'á´', 'á´ƒ', 'á´„', 'á´…', 'á´†', 'á´‡', 'á´Š', 'á´‹', 'á´Œ', 'á´', 'á´', 'á´˜', 'á´›', 'á´œ', 'á´ ', 'á´¡', 'á´¢', 'ᵫ', 'ᵬ', 'áµ', 'áµ®', 'ᵯ', 'áµ°', 'áµ±', 'áµ²', 'áµ³', 'áµ´', 'áµµ', 'áµ¶', 'ᵺ', 'áµ»', 'áµ½', 'áµ¾', 'á¶€', 'á¶', 'á¶‚', 'ᶃ', 'á¶„', 'á¶…', 'ᶆ', 'ᶇ', 'ᶈ', 'ᶉ', 'á¶Š', 'á¶Œ', 'á¶', 'á¶Ž', 'á¶', 'á¶‘', 'á¶’', 'á¶“', 'á¶–', 'á¶™', 'ẚ', 'ẜ', 'áº', 'ẞ', 'Ỻ', 'á»»', 'Ỽ', 'ỽ', 'Ỿ', 'ỿ', '©', '®', 'â‚ ', 'â‚¢', 'â‚£', '₤', 'â‚§', '₺', '₹', 'ℌ', '℞', '㎧', '㎮', 'ã†', 'ã—', 'ãž', 'ãŸ', '¼', '½', '¾', 'â…“', 'â…”', 'â…•', 'â…–', 'â…—', 'â…˜', 'â…™', 'â…š', 'â…›', 'â…œ', 'â…', 'â…ž', 'â…Ÿ', '〇', '‘', '’', '‚', '‛', '“', 'â€', '„', '‟', '′', '″', 'ã€', '〞', '«', '»', '‹', '›', 'â€', '‑', '‒', '–', '—', '―', '︱', '︲', '﹘', '‖', 'â„', 'â…', 'â†', 'âŽ', 'ã€', '。', '〈', '〉', '《', '》', '〔', '〕', '〘', '〙', '〚', '〛', '︑', '︒', '︹', '︺', '︽', '︾', '︿', 'ï¹€', '﹑', 'ï¹', '﹞', '⦅', 'ï½ ', '。', '、', '×', '÷', '−', '∕', '∖', '∣', '∥', '≪', '≫', '⦅', '⦆'];
|
|
|
49 |
private const TRANSLIT_TO = ['AE', 'D', 'O', 'TH', 'ss', 'ae', 'd', 'o', 'th', 'D', 'd', 'H', 'h', 'i', 'q', 'L', 'l', 'L', 'l', '\'n', 'N', 'n', 'OE', 'oe', 'T', 't', 'b', 'B', 'B', 'b', 'C', 'c', 'D', 'D', 'D', 'd', 'E', 'F', 'f', 'G', 'hv', 'I', 'I', 'K', 'k', 'l', 'N', 'n', 'OI', 'oi', 'P', 'p', 't', 'T', 't', 'T', 'V', 'Y', 'y', 'Z', 'z', 'DZ', 'Dz', 'dz', 'G', 'g', 'd', 'Z', 'z', 'l', 'n', 't', 'j', 'db', 'qp', 'A', 'C', 'c', 'L', 'T', 's', 'z', 'B', 'U', 'E', 'e', 'J', 'j', 'R', 'r', 'Y', 'y', 'b', 'c', 'd', 'd', 'e', 'j', 'g', 'g', 'G', 'h', 'h', 'i', 'I', 'l', 'l', 'l', 'm', 'n', 'n', 'N', 'OE', 'r', 'r', 'r', 'R', 's', 't', 'u', 'v', 'Y', 'z', 'z', 'B', 'G', 'H', 'j', 'L', 'q', 'dz', 'dz', 'ts', 'ls', 'lz', 'A', 'AE', 'B', 'C', 'D', 'D', 'E', 'J', 'K', 'L', 'M', 'O', 'P', 'T', 'U', 'V', 'W', 'Z', 'ue', 'b', 'd', 'f', 'm', 'n', 'p', 'r', 'r', 's', 't', 'z', 'th', 'I', 'p', 'U', 'b', 'd', 'f', 'g', 'k', 'l', 'm', 'n', 'p', 'r', 's', 'v', 'x', 'z', 'a', 'd', 'e', 'e', 'i', 'u', 'a', 's', 's', 'SS', 'LL', 'll', 'V', 'v', 'Y', 'y', '(C)', '(R)', 'CE', 'Cr', 'Fr.', 'L.', 'Pts', 'TL', 'Rs', 'x', 'Rx', 'm/s', 'rad/s', 'C/kg', 'pH', 'V/m', 'A/m', ' 1/4', ' 1/2', ' 3/4', ' 1/3', ' 2/3', ' 1/5', ' 2/5', ' 3/5', ' 4/5', ' 1/6', ' 5/6', ' 1/8', ' 3/8', ' 5/8', ' 7/8', ' 1/', '0', '\'', '\'', ',', '\'', '"', '"', ',,', '"', '\'', '"', '"', '"', '<<', '>>', '<', '>', '-', '-', '-', '-', '-', '-', '-', '-', '-', '||', '/', '[', ']', '*', ',', '.', '<', '>', '<<', '>>', '[', ']', '[', ']', '[', ']', ',', '.', '[', ']', '<<', '>>', '<', '>', ',', '[', ']', '((', '))', '.', ',', '*', '/', '-', '/', '\\', '|', '||', '<<', '>>', '((', '))'];
|
|
|
50 |
|
|
|
51 |
private static $transliterators = [];
|
|
|
52 |
private static $tableZero;
|
|
|
53 |
private static $tableWide;
|
|
|
54 |
|
|
|
55 |
/**
 * Builds a string by UTF-8-encoding the given code points, in order.
 *
 * Every code is first reduced modulo 0x200000 and then written as a
 * 1-, 2-, 3- or 4-byte sequence depending on its magnitude.
 */
public static function fromCodePoints(int ...$codes): static
{
    $bytes = '';

    foreach ($codes as $codePoint) {
        $codePoint %= 0x200000;

        $bytes .= match (true) {
            // single byte
            0x80 > $codePoint => \chr($codePoint),
            // two bytes: 110xxxxx 10xxxxxx
            0x800 > $codePoint => \chr(0xC0 | ($codePoint >> 6))
                .\chr(0x80 | ($codePoint & 0x3F)),
            // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
            0x10000 > $codePoint => \chr(0xE0 | ($codePoint >> 12))
                .\chr(0x80 | (($codePoint >> 6) & 0x3F))
                .\chr(0x80 | ($codePoint & 0x3F)),
            // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            default => \chr(0xF0 | ($codePoint >> 18))
                .\chr(0x80 | (($codePoint >> 12) & 0x3F))
                .\chr(0x80 | (($codePoint >> 6) & 0x3F))
                .\chr(0x80 | ($codePoint & 0x3F)),
        };
    }

    return new static($bytes);
}
|
|
|
73 |
|
|
|
74 |
/**
 * Generic UTF-8 to ASCII transliteration.
 *
 * Install the intl extension for best results.
 *
 * The caller's rules are wrapped between a leading "nfd" step and the trailing built-in
 * steps. One rule is consumed per iteration, for as long as non-ASCII data remains; once
 * the rules are exhausted (or an empty rule is met), the remainder goes through iconv, or
 * is replaced by "?" when iconv is not available.
 *
 * @param string[]|\Transliterator[]|\Closure[] $rules See "*-Latin" rules from Transliterator::listIDs()
 *
 * @throws InvalidArgumentException When a string rule cannot be resolved to a transliterator
 * @throws \LogicException          When iconv is used but is unable to transliterate
 */
public function ascii(array $rules = []): self
{
    $result = clone $this;
    $remaining = $result->string;
    $result->string = '';

    array_unshift($rules, 'nfd');
    $rules[] = 'latin-ascii';

    if (\function_exists('transliterator_transliterate')) {
        $rules[] = 'any-latin/bgn';
    }

    $rules[] = 'nfkd';
    $rules[] = '[:nonspacing mark:] remove';

    while (\strlen($remaining) - 1 > $asciiLen = strspn($remaining, self::ASCII)) {
        // Flush the leading ASCII run to the output, except for its last byte which stays
        // in the working buffer next to the non-ASCII data that follows it.
        if (0 < --$asciiLen) {
            $result->string .= substr($remaining, 0, $asciiLen);
            $remaining = substr($remaining, $asciiLen);
        }

        $rule = array_shift($rules);

        if (!$rule) {
            $rules = []; // An empty rule interrupts the next ones
        }

        if ($rule instanceof \Transliterator) {
            $remaining = $rule->transliterate($remaining);

            continue;
        }

        if ($rule instanceof \Closure) {
            $remaining = $rule($remaining);

            continue;
        }

        if ($rule) {
            $ruleId = strtolower($rule);

            if ('nfd' === $ruleId) {
                if (!normalizer_is_normalized($remaining, self::NFD)) {
                    $remaining = normalizer_normalize($remaining, self::NFD);
                }
            } elseif ('nfkd' === $ruleId) {
                if (!normalizer_is_normalized($remaining, self::NFKD)) {
                    $remaining = normalizer_normalize($remaining, self::NFKD);
                }
            } elseif ('[:nonspacing mark:] remove' === $ruleId) {
                $remaining = preg_replace('/\p{Mn}++/u', '', $remaining);
            } elseif ('latin-ascii' === $ruleId) {
                $remaining = str_replace(self::TRANSLIT_FROM, self::TRANSLIT_TO, $remaining);
            } elseif ('de-ascii' === $ruleId) {
                $remaining = preg_replace("/([AUO])\u{0308}(?=\p{Ll})/u", '$1e', $remaining);
                $remaining = str_replace(["a\u{0308}", "o\u{0308}", "u\u{0308}", "A\u{0308}", "O\u{0308}", "U\u{0308}"], ['ae', 'oe', 'ue', 'AE', 'OE', 'UE'], $remaining);
            } elseif (\function_exists('transliterator_transliterate')) {
                $transliterator = self::$transliterators[$ruleId] ??= \Transliterator::create($ruleId);

                if (null === $transliterator) {
                    // "any-latin/bgn" may be unavailable: retry with plain "any-latin"
                    if ('any-latin/bgn' === $ruleId) {
                        $ruleId = 'any-latin';
                        $transliterator = self::$transliterators[$ruleId] ??= \Transliterator::create($ruleId);
                    }

                    if (null === $transliterator) {
                        throw new InvalidArgumentException(sprintf('Unknown transliteration rule "%s".', $ruleId));
                    }

                    // Remember the substitute so the failing lookup is not repeated
                    self::$transliterators['any-latin/bgn'] = $transliterator;
                }

                $remaining = $transliterator->transliterate($remaining);
            }

            continue;
        }

        // No rule left: last-resort conversion of whatever is still not ASCII
        if (!\function_exists('iconv')) {
            $remaining = preg_replace('/[^\x00-\x7F]/u', '?', $remaining);

            continue;
        }

        $remaining = @preg_replace_callback('/[^\x00-\x7F]/u', static function ($match) {
            $ascii = (string) iconv('UTF-8', 'ASCII//TRANSLIT', $match[0]);

            if ('' === $ascii && '' === iconv('UTF-8', 'ASCII//TRANSLIT', '²')) {
                throw new \LogicException(sprintf('"%s" requires a translit-able iconv implementation, try installing "gnu-libiconv" if you\'re using Alpine Linux.', static::class));
            }

            if (1 < \strlen($ascii)) {
                return ltrim($ascii, '\'`"^~');
            }

            return '' !== $ascii ? $ascii : '?';
        }, $remaining);
    }

    $result->string .= $remaining;

    return $result;
}
|
|
|
158 |
|
|
|
159 |
/**
 * Returns a camelCase copy of the string.
 *
 * Runs of characters that are neither letters nor ASCII digits become word separators;
 * the first matched word-initial character is lower-cased, the following ones are
 * title-cased, and the separators are then removed.
 */
public function camel(): static
{
    $str = clone $this;

    // Collapse every run of non-letter / non-digit characters into a single space
    $spaced = preg_replace('/[^\pL0-9]++/u', ' ', $this->string);

    $matchCount = 0;
    $cased = preg_replace_callback('/\b.(?![A-Z]{2,})/u', static function ($m) use (&$matchCount) {
        if (1 !== ++$matchCount) {
            return mb_convert_case($m[0], \MB_CASE_TITLE, 'UTF-8');
        }

        // U+0130 (capital I with dot above) is mapped by hand to "i" followed by U+0307
        if ("\u{130}" === $m[0]) {
            return "i\u{307}";
        }

        return mb_strtolower($m[0], 'UTF-8');
    }, $spaced);

    $str->string = str_replace(' ', '', $cased);

    return $str;
}
|
|
|
168 |
|
|
|
169 |
/**
 * Returns the code points of the one-character slice taken at the given offset.
 *
 * @return int[] An empty array when the slice is empty
 */
public function codePointsAt(int $offset): array
{
    $char = $this->slice($offset, 1);

    if ('' === $char->string) {
        return [];
    }

    // Split the slice into individual code points, then map each of them to its number
    return array_map(
        static fn ($c) => mb_ord($c, 'UTF-8'),
        preg_split('//u', $char->string, -1, \PREG_SPLIT_NO_EMPTY)
    );
}
|
|
|
188 |
|
|
|
189 |
/**
 * Returns a case-folded copy of the string, suitable for caseless comparisons.
 *
 * When $compat is true and Normalizer::NFKC_CF is defined, folding is delegated to the
 * normalizer. Otherwise the string is normalized (NFKC when $compat is true, NFC when it
 * is false), the FOLD_FROM/FOLD_TO mappings are applied and the result is lower-cased.
 *
 * @param bool $compat Whether to use compatibility normalization
 */
public function folded(bool $compat = true): static
{
    $str = clone $this;

    if ($compat && \defined('Normalizer::NFKC_CF')) {
        $str->string = normalizer_normalize($str->string, \Normalizer::NFKC_CF);

        return $str;
    }

    $normalized = normalizer_normalize($str->string, $compat ? \Normalizer::NFKC : \Normalizer::NFC);

    // Fold the *normalized* text: folding the original string here would silently
    // throw away the normalization that was just computed.
    $str->string = mb_strtolower(str_replace(self::FOLD_FROM, self::FOLD_TO, $normalized), 'UTF-8');

    return $str;
}
|
|
|
202 |
|
|
|
203 |
/**
 * Joins the given strings, using the current string as glue.
 *
 * When $lastGlue is not null and at least two strings are given, the last string is
 * attached with $lastGlue instead of the regular glue.
 *
 * @param string[]    $strings  The pieces to join
 * @param string|null $lastGlue Glue placed before the last piece, or null to use the regular glue
 *
 * @throws InvalidArgumentException When the result is not valid UTF-8
 */
public function join(array $strings, ?string $lastGlue = null): static
{
    $str = clone $this;

    $tail = '';

    if (null !== $lastGlue && 1 < \count($strings)) {
        $tail = $lastGlue.array_pop($strings);
    }

    $str->string = implode($this->string, $strings).$tail;

    // The pieces come from the caller, so the assembled string has to be re-validated
    if (!preg_match('//u', $str->string)) {
        throw new InvalidArgumentException('Invalid UTF-8 string.');
    }

    return $str;
}
|
|
|
216 |
|
|
|
217 |
/**
 * Returns a lower-cased copy of the string.
 */
public function lower(): static
{
    $lowered = clone $this;

    // U+0130 (capital I with dot above) is expanded to "i" + U+0307 before lower-casing
    $prepared = str_replace("\u{130}", "i\u{307}", $lowered->string);
    $lowered->string = mb_strtolower($prepared, 'UTF-8');

    return $lowered;
}
|
|
|
224 |
|
|
|
225 |
/**
 * Matches the string against a regular expression, always in UTF-8 mode.
 *
 * preg_match_all() is used when $flags contains PREG_PATTERN_ORDER or PREG_SET_ORDER,
 * preg_match() otherwise. The "i" modifier is appended when the string ignores case, and
 * PREG_UNMATCHED_AS_NULL is always added to the flags.
 *
 * @return array The matches as filled in by the preg function
 *
 * @throws InvalidArgumentException When PHP reports an error while matching
 * @throws RuntimeException         When the preg function returns false
 */
public function match(string $regexp, int $flags = 0, int $offset = 0): array
{
    $matcher = 'preg_match';

    if ((\PREG_PATTERN_ORDER | \PREG_SET_ORDER) & $flags) {
        $matcher = 'preg_match_all';
    }

    if ($this->ignoreCase) {
        $regexp .= 'i';
    }

    $pattern = $regexp.'u';

    // Turn PHP errors raised while matching into exceptions
    set_error_handler(static function ($t, $m) { throw new InvalidArgumentException($m); });

    try {
        $result = $matcher($pattern, $this->string, $matches, $flags | \PREG_UNMATCHED_AS_NULL, $offset);

        if (false === $result) {
            throw new RuntimeException('Matching failed with error: '.preg_last_error_msg());
        }
    } finally {
        restore_error_handler();
    }

    return $matches;
}
|
|
|
245 |
|
|
|
246 |
/**
 * Returns a copy of the string normalized to the given Unicode normalization form.
 *
 * The normalizer is only invoked when the string is not already in the requested form.
 *
 * @param int $form One of self::NFC, self::NFD, self::NFKC or self::NFKD
 *
 * @throws InvalidArgumentException When $form is none of the four supported forms
 */
public function normalize(int $form = self::NFC): static
{
    // Strict lookup: only the exact integer constants are accepted
    if (!\in_array($form, [self::NFC, self::NFD, self::NFKC, self::NFKD], true)) {
        throw new InvalidArgumentException('Unsupported normalization form.');
    }

    $str = clone $this;

    if (!normalizer_is_normalized($str->string, $form)) {
        $str->string = normalizer_normalize($str->string, $form);
    }

    return $str;
}
|
|
|
257 |
|
|
|
258 |
/**
 * Returns a copy padded on both sides with $padStr, up to the given length.
 *
 * @throws InvalidArgumentException When $padStr is empty or not valid UTF-8
 */
public function padBoth(int $length, string $padStr = ' '): static
{
    $isUsable = '' !== $padStr && preg_match('//u', $padStr);

    if (!$isUsable) {
        throw new InvalidArgumentException('Invalid UTF-8 string.');
    }

    // Wrap the raw padding in an instance of the current class so pad() can measure and slice it
    $padding = clone $this;
    $padding->string = $padStr;

    return $this->pad($length, $padding, \STR_PAD_BOTH);
}
|
|
|
269 |
|
|
|
270 |
/**
 * Returns a copy padded at the end with $padStr, up to the given length.
 *
 * @throws InvalidArgumentException When $padStr is empty or not valid UTF-8
 */
public function padEnd(int $length, string $padStr = ' '): static
{
    $isUsable = '' !== $padStr && preg_match('//u', $padStr);

    if (!$isUsable) {
        throw new InvalidArgumentException('Invalid UTF-8 string.');
    }

    // Wrap the raw padding in an instance of the current class so pad() can measure and slice it
    $padding = clone $this;
    $padding->string = $padStr;

    return $this->pad($length, $padding, \STR_PAD_RIGHT);
}
|
|
|
281 |
|
|
|
282 |
/**
 * Returns a copy padded at the start with $padStr, up to the given length.
 *
 * @throws InvalidArgumentException When $padStr is empty or not valid UTF-8
 */
public function padStart(int $length, string $padStr = ' '): static
{
    $isUsable = '' !== $padStr && preg_match('//u', $padStr);

    if (!$isUsable) {
        throw new InvalidArgumentException('Invalid UTF-8 string.');
    }

    // Wrap the raw padding in an instance of the current class so pad() can measure and slice it
    $padding = clone $this;
    $padding->string = $padStr;

    return $this->pad($length, $padding, \STR_PAD_LEFT);
}
|
|
|
293 |
|
|
|
294 |
/**
 * Replaces the matches of a regular expression, always in UTF-8 mode.
 *
 * A string $to is always used as a preg_replace() replacement, even when it happens to be
 * the name of a function. Any other callable (array callable, closure or invokable object)
 * is called through preg_replace_callback() and must return a valid UTF-8 string.
 *
 * @param string          $fromRegexp The pattern; "i" is appended when the string ignores case
 * @param string|callable $to         The replacement string, or a callable receiving the match array
 *
 * @throws InvalidArgumentException When $to or the callable's result is not valid UTF-8,
 *                                  or when PHP reports an error while replacing
 * @throws RuntimeException         When the preg function fails
 */
public function replaceMatches(string $fromRegexp, string|callable $to): static
{
    if ($this->ignoreCase) {
        $fromRegexp .= 'i';
    }

    if (!\is_string($to)) {
        // Given the parameter type, a non-string value is necessarily a callable. Testing
        // for "not a string" also covers invokable objects, which would otherwise be
        // handed to preg_match() below as if they were a replacement string.
        $callback = $to;
        $replace = 'preg_replace_callback';
        $to = static function (array $m) use ($callback): string {
            $replacement = $callback($m);

            if ('' !== $replacement && (!\is_string($replacement) || !preg_match('//u', $replacement))) {
                throw new InvalidArgumentException('Replace callback must return a valid UTF-8 string.');
            }

            return $replacement;
        };
    } elseif ('' !== $to && !preg_match('//u', $to)) {
        throw new InvalidArgumentException('Invalid UTF-8 string.');
    } else {
        $replace = 'preg_replace';
    }

    // Turn PHP errors raised while replacing into exceptions
    set_error_handler(static function ($t, $m) { throw new InvalidArgumentException($m); });

    try {
        if (null === $string = $replace($fromRegexp.'u', $to, $this->string)) {
            $lastError = preg_last_error();

            // Report the failure using the name of the matching PREG_*_ERROR constant
            foreach (get_defined_constants(true)['pcre'] as $k => $v) {
                if ($lastError === $v && str_ends_with($k, '_ERROR')) {
                    throw new RuntimeException('Matching failed with '.$k.'.');
                }
            }

            throw new RuntimeException('Matching failed with unknown error code.');
        }
    } finally {
        restore_error_handler();
    }

    $str = clone $this;
    $str->string = $string;

    return $str;
}
|
|
|
340 |
|
|
|
341 |
/**
 * Returns a copy of the string with its \X (extended grapheme cluster) matches in reverse order.
 */
public function reverse(): static
{
    $reversed = clone $this;

    // Capturing the delimiter makes preg_split() return the clusters themselves
    $clusters = preg_split('/(\X)/u', $reversed->string, -1, \PREG_SPLIT_DELIM_CAPTURE | \PREG_SPLIT_NO_EMPTY);
    $reversed->string = implode('', array_reverse($clusters));

    return $reversed;
}
|
|
|
348 |
|
|
|
349 |
/**
 * Returns a snake_case copy of the string.
 *
 * The string is first converted with camel(); an underscore is then inserted at each
 * lower-to-upper (or digit-to-upper) transition and before the last capital of an
 * upper-case run that is followed by a lower-case letter; finally everything is lower-cased.
 */
public function snake(): static
{
    $snake = $this->camel();

    $boundaries = [
        '/(\p{Lu}+)(\p{Lu}\p{Ll})/u',
        '/([\p{Ll}0-9])(\p{Lu})/u',
    ];
    $separated = preg_replace($boundaries, '\1_\2', $snake->string);

    $snake->string = mb_strtolower($separated, 'UTF-8');

    return $snake;
}
|
|
|
356 |
|
|
|
357 |
/**
 * Returns a copy with the character following a word boundary converted to title case.
 *
 * @param bool $allWords Whether to convert every such character instead of only the first one
 */
public function title(bool $allWords = false): static
{
    $titled = clone $this;

    // -1 means "no limit" for preg_replace_callback()
    $maxReplacements = $allWords ? -1 : 1;

    $titled->string = preg_replace_callback(
        '/\b./u',
        static fn (array $m): string => mb_convert_case($m[0], \MB_CASE_TITLE, 'UTF-8'),
        $titled->string,
        $maxReplacements
    );

    return $titled;
}
|
|
|
369 |
|
|
|
370 |
/**
 * Returns a copy with the given characters stripped from both ends.
 *
 * @param string $chars The characters to strip; an empty list strips nothing
 *
 * @throws InvalidArgumentException When $chars is not valid UTF-8
 */
public function trim(string $chars = " \t\n\r\0\x0B\x0C\u{A0}\u{FEFF}"): static
{
    if (" \t\n\r\0\x0B\x0C\u{A0}\u{FEFF}" !== $chars && !preg_match('//u', $chars)) {
        throw new InvalidArgumentException('Invalid UTF-8 chars.');
    }

    $str = clone $this;

    // Nothing to strip. An empty list must not reach the pattern below: it would yield
    // "^[]++|[]++$", which PCRE reads as a single class made of "]", "+", "|" and "[".
    if ('' === $chars) {
        return $str;
    }

    $chars = preg_quote($chars);
    $str->string = preg_replace("{^[$chars]++|[$chars]++$}uD", '', $str->string);

    return $str;
}
|
|
|
382 |
|
|
|
383 |
/**
 * Returns a copy with the given characters stripped from the end.
 *
 * @param string $chars The characters to strip; an empty list strips nothing
 *
 * @throws InvalidArgumentException When $chars is not valid UTF-8
 */
public function trimEnd(string $chars = " \t\n\r\0\x0B\x0C\u{A0}\u{FEFF}"): static
{
    if (" \t\n\r\0\x0B\x0C\u{A0}\u{FEFF}" !== $chars && !preg_match('//u', $chars)) {
        throw new InvalidArgumentException('Invalid UTF-8 chars.');
    }

    $str = clone $this;

    // Nothing to strip. An empty list must not reach the pattern below: "[]++$" is an
    // unterminated character class, so preg_replace() would fail and return null.
    if ('' === $chars) {
        return $str;
    }

    $chars = preg_quote($chars);
    $str->string = preg_replace("{[$chars]++$}uD", '', $str->string);

    return $str;
}
|
|
|
395 |
|
|
|
396 |
/**
 * Returns a copy with one of the given prefixes removed from the start.
 *
 * Case-sensitive strings are handled by the parent implementation. Otherwise the
 * prefixes are tried case-insensitively, in the given order, through a regular expression.
 *
 * @param mixed $prefix A single prefix (plain string or string object) or a list of them (array or \Traversable)
 */
public function trimPrefix($prefix): static
{
    if (!$this->ignoreCase) {
        return parent::trimPrefix($prefix);
    }

    $trimmed = clone $this;

    // Reduce every accepted input shape to a plain string or an array of candidates
    if ($prefix instanceof \Traversable) {
        $prefix = iterator_to_array($prefix, false);
    } elseif ($prefix instanceof parent) {
        $prefix = $prefix->string;
    }

    $quoted = array_map('preg_quote', (array) $prefix);
    $alternatives = implode('|', $quoted);

    $trimmed->string = preg_replace("{^(?:$alternatives)}iuD", '', $this->string);

    return $trimmed;
}
|
|
|
415 |
|
|
|
416 |
/**
 * Returns a copy with the given characters stripped from the start.
 *
 * @param string $chars The characters to strip; an empty list strips nothing
 *
 * @throws InvalidArgumentException When $chars is not valid UTF-8
 */
public function trimStart(string $chars = " \t\n\r\0\x0B\x0C\u{A0}\u{FEFF}"): static
{
    if (" \t\n\r\0\x0B\x0C\u{A0}\u{FEFF}" !== $chars && !preg_match('//u', $chars)) {
        throw new InvalidArgumentException('Invalid UTF-8 chars.');
    }

    $str = clone $this;

    // Nothing to strip. An empty list must not reach the pattern below: "^[]++" is an
    // unterminated character class, so preg_replace() would fail and return null.
    if ('' === $chars) {
        return $str;
    }

    $chars = preg_quote($chars);
    $str->string = preg_replace("{^[$chars]++}uD", '', $str->string);

    return $str;
}
|
|
|
428 |
|
|
|
429 |
/**
 * Returns a copy with one of the given suffixes removed from the end.
 *
 * Case-sensitive strings are handled by the parent implementation. Otherwise the
 * suffixes are matched case-insensitively through a regular expression.
 *
 * @param mixed $suffix A single suffix (plain string or string object) or a list of them (array or \Traversable)
 */
public function trimSuffix($suffix): static
{
    if (!$this->ignoreCase) {
        return parent::trimSuffix($suffix);
    }

    $trimmed = clone $this;

    // Reduce every accepted input shape to a plain string or an array of candidates
    if ($suffix instanceof \Traversable) {
        $suffix = iterator_to_array($suffix, false);
    } elseif ($suffix instanceof parent) {
        $suffix = $suffix->string;
    }

    $quoted = array_map('preg_quote', (array) $suffix);
    $alternatives = implode('|', $quoted);

    $trimmed->string = preg_replace("{(?:$alternatives)$}iuD", '', $this->string);

    return $trimmed;
}
|
|
|
448 |
|
|
|
449 |
/**
 * Returns an upper-cased copy of the string.
 */
public function upper(): static
{
    $uppercased = clone $this;
    $uppercased->string = mb_strtoupper($uppercased->string, 'UTF-8');

    return $uppercased;
}
|
|
|
456 |
|
|
|
457 |
/**
 * Returns the display width of the string, i.e. the width of its widest line.
 *
 * NUL, ENQ and BEL bytes are dropped and "\r\n" / "\r" are treated as "\n" before the
 * lines are measured with wcswidth().
 *
 * @param bool $ignoreAnsiDecoration Whether escape sequences and control characters are
 *                                   removed line by line before measuring
 */
public function width(bool $ignoreAnsiDecoration = true): int
{
    $text = str_replace(["\x00", "\x05", "\x07"], '', $this->string);

    if (str_contains($text, "\r")) {
        $text = str_replace(["\r\n", "\r"], "\n", $text);
    }

    if (!$ignoreAnsiDecoration) {
        // NOTE(review): \p{Cc} also covers "\n", so in this mode the text appears to be
        // measured as one single line - confirm this is intended.
        $text = preg_replace('/[\p{Cc}\x7F]++/u', '', $text);
    }

    $maxWidth = 0;

    foreach (explode("\n", $text) as $line) {
        if ($ignoreAnsiDecoration) {
            // Strip escape sequences (CSI, string-type and single-character ones) and
            // any remaining control character
            $line = preg_replace('/(?:\x1B(?:
                \[ [\x30-\x3F]*+ [\x20-\x2F]*+ [\x40-\x7E]
                | [P\]X^_] .*? \x1B\\\\
                | [\x41-\x7E]
            )|[\p{Cc}\x7F]++)/xu', '', $line);
        }

        $maxWidth = max($maxWidth, $this->wcswidth($line));
    }

    return $maxWidth;
}
|
|
|
488 |
|
|
|
489 |
/**
 * Pads the string with $pad up to $len characters, on the side(s) selected by $type.
 *
 * A clone is returned unchanged when the string is already at least $len long. With
 * STR_PAD_BOTH, the end of the string receives the extra character when the free
 * space is odd.
 *
 * @param int  $len  The target length, in the unit used by length()
 * @param self $pad  The padding; expected to be non-empty (the public pad*() methods reject an empty string)
 * @param int  $type One of STR_PAD_RIGHT, STR_PAD_LEFT or STR_PAD_BOTH
 *
 * @throws InvalidArgumentException When $type is not a supported padding type
 */
private function pad(int $len, self $pad, int $type): static
{
    $sLen = $this->length();

    if ($len <= $sLen) {
        return clone $this;
    }

    $padLen = $pad->length();
    $freeLen = $len - $sLen;

    // Builds $count characters of padding: whole repetitions of the pad string,
    // completed by a leading slice of it when $count is not a multiple of its length.
    $fill = static function (int $count) use ($pad, $padLen): string {
        $rest = $count % $padLen;

        return str_repeat($pad->string, intdiv($count, $padLen)).($rest ? $pad->slice(0, $rest) : '');
    };

    // Integer-only split of the free space: left gets the floor, right gets the ceiling
    $leftLen = intdiv($freeLen, 2);
    $rightLen = $freeLen - $leftLen;

    return match ($type) {
        \STR_PAD_RIGHT => $this->append($fill($freeLen)),
        \STR_PAD_LEFT => $this->prepend($fill($freeLen)),
        \STR_PAD_BOTH => $this->append($fill($rightLen))->prepend($fill($leftLen)),
        default => throw new InvalidArgumentException('Invalid padding type.'),
    };
}
|
|
|
524 |
|
|
|
525 |
/**
 * Computes the display width of a single line of text.
 *
 * Based on https://github.com/jquast/wcwidth, a Python implementation of https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c.
 *
 * Code points found in the "zero" table and a few invisible formatting characters count
 * for 0 columns, code points found in the "wide" table count for 2 and the others for 1.
 *
 * @return int The width in columns, or -1 as soon as a non-printable control character is met
 */
private function wcswidth(string $string): int
{
    $width = 0;

    foreach (preg_split('//u', $string, -1, \PREG_SPLIT_NO_EMPTY) as $c) {
        $codePoint = mb_ord($c, 'UTF-8');

        if (0 === $codePoint // NULL
            || 0x034F === $codePoint // COMBINING GRAPHEME JOINER
            || (0x200B <= $codePoint && 0x200F >= $codePoint) // ZERO WIDTH SPACE to RIGHT-TO-LEFT MARK
            || 0x2028 === $codePoint // LINE SEPARATOR
            || 0x2029 === $codePoint // PARAGRAPH SEPARATOR
            || (0x202A <= $codePoint && 0x202E >= $codePoint) // LEFT-TO-RIGHT EMBEDDING to RIGHT-TO-LEFT OVERRIDE
            || (0x2060 <= $codePoint && 0x2063 >= $codePoint) // WORD JOINER to INVISIBLE SEPARATOR
        ) {
            continue;
        }

        // Non printable characters
        if (32 > $codePoint // C0 control characters
            || (0x07F <= $codePoint && 0x0A0 > $codePoint) // C1 control characters and DEL
        ) {
            return -1;
        }

        // The tables are loaded on first use and kept for the lifetime of the process
        self::$tableZero ??= require __DIR__.'/Resources/data/wcswidth_table_zero.php';

        if (self::isInRangeTable($codePoint, self::$tableZero)) {
            continue;
        }

        self::$tableWide ??= require __DIR__.'/Resources/data/wcswidth_table_wide.php';

        $width += self::isInRangeTable($codePoint, self::$tableWide) ? 2 : 1;
    }

    return $width;
}

/**
 * Tells whether a code point falls inside one of the ranges of a width table.
 *
 * @param int   $codePoint The code point to look up
 * @param array $table     A list of [first, last] pairs; the binary search below assumes
 *                         the pairs are sorted in ascending order and do not overlap
 */
private static function isInRangeTable(int $codePoint, array $table): bool
{
    $lbound = 0;
    $ubound = \count($table) - 1;

    // Quick rejection of anything outside the overall span of the table
    if ($codePoint < $table[0][0] || $codePoint > $table[$ubound][1]) {
        return false;
    }

    while ($ubound >= $lbound) {
        // Integer midpoint, so the table is never indexed with a float
        $mid = intdiv($lbound + $ubound, 2);

        if ($codePoint > $table[$mid][1]) {
            $lbound = $mid + 1;
        } elseif ($codePoint < $table[$mid][0]) {
            $ubound = $mid - 1;
        } else {
            return true;
        }
    }

    return false;
}
|
|
|
594 |
}
|