Strings.fixEncoding().phpt 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380
  1. <?php
  2. /**
  3. * Test: Nette\Utils\Strings::fixEncoding()
  4. */
  5. declare(strict_types=1);
  6. use Nette\Utils\Strings;
  7. use Tester\Assert;
  8. require __DIR__ . '/../bootstrap.php';
  9. // Based on "UTF-8 decoder capability and stress test" by Markus Kuhn
  10. // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
  // Fixture table for Strings::fixEncoding().
  //
  // Groups are arrays keyed by their section label and may nest to any depth.
  // A leaf is a plain list [input, expected output]; the runner at the end of
  // this file recognises a leaf by the presence of index 0. An expected value
  // of '' means the whole input is expected to be removed.
  $tests = [
      // Section 1: well-formed text passes through untouched.
      '1 Some correct UTF-8 text' => [
          "\u{3BA}\u{1F79}\u{3C3}\u{3BC}\u{3B5}",
          "\u{3BA}\u{1F79}\u{3C3}\u{3BC}\u{3B5}",
      ],

      // Section 2: first/last code points for each sequence length.
      '2 Boundary condition test cases' => [
          '2.1 First possible sequence of a certain length' => [
              '2.1.1 1 byte (U-00000000)' => ["\x00", "\x00"],
              '2.1.2 2 bytes (U-00000080)' => ["\u{80}", "\u{80}"],
              '2.1.3 3 bytes (U-00000800)' => ["\u{800}", "\u{800}"],
              '2.1.4 4 bytes (U-00010000)' => ["\u{10000}", "\u{10000}"],
              '2.1.5 5 bytes (U-00200000)' => ["\xF8\x88\x80\x80\x80", ''],
              '2.1.6 6 bytes (U-04000000)' => ["\xFC\x84\x80\x80\x80\x80", ''],
          ],
          '2.2 Last possible sequence of a certain length' => [
              '2.2.1 1 byte (U-0000007F)' => ["\x7F", "\x7F"],
              '2.2.2 2 bytes (U-000007FF)' => ["\u{7FF}", "\u{7FF}"],
              '2.2.3 3 bytes (U-0000FFFF)' => ["\u{FFFF}", "\u{FFFF}"],
              '2.2.4 4 bytes (U-001FFFFF)' => ["\xF7\xBF\xBF\xBF", ''],
              '2.2.5 5 bytes (U-03FFFFFF)' => ["\xFB\xBF\xBF\xBF\xBF", ''],
              '2.2.6 6 bytes (U-7FFFFFFF)' => ["\xFD\xBF\xBF\xBF\xBF\xBF", ''],
          ],
          '2.3 Other boundary conditions' => [
              '2.3.1 U-0000D7FF' => ["\u{D7FF}", "\u{D7FF}"],
              '2.3.2 U-0000E000' => ["\u{E000}", "\u{E000}"],
              '2.3.3 U-0000FFFD' => ["\u{FFFD}", "\u{FFFD}"],
              '2.3.4 U-0010FFFF' => ["\u{10FFFF}", "\u{10FFFF}"],
              '2.3.5 U-00110000' => ["\xF4\x90\x80\x80", ''],
          ],
      ],

      // Section 3: stray, truncated and impossible bytes.
      '3 Malformed sequences' => [
          '3.1 Unexpected continuation bytes' => [
              '3.1.1 First continuation byte 0x80' => ["\x80", ''],
              '3.1.2 Last continuation byte 0xbf' => ["\xBF", ''],
              '3.1.3 2 continuation bytes' => ["\x80\xBF", ''],
              '3.1.4 3 continuation bytes' => ["\x80\xBF\x80", ''],
              '3.1.5 4 continuation bytes' => ["\x80\xBF\x80\xBF", ''],
              '3.1.6 5 continuation bytes' => ["\x80\xBF\x80\xBF\x80", ''],
              '3.1.7 6 continuation bytes' => ["\x80\xBF\x80\xBF\x80\xBF", ''],
              '3.1.8 7 continuation bytes' => ["\x80\xBF\x80\xBF\x80\xBF\x80", ''],
              '3.1.9 Sequence of all 64 possible continuation bytes (0x80-0xbf)' => [
                  implode('', range("\x80", "\xBF")),
                  '',
              ],
          ],
          // Each lead byte is followed by a space; only the spaces are expected back.
          '3.2 Lonely start characters' => [
              '3.2.1 All 32 first bytes of 2-byte sequences (0xc0-0xdf), each followed by a space character' => [
                  implode(' ', range("\xC0", "\xDF")) . ' ',
                  str_repeat(' ', 32),
              ],
              '3.2.2 All 16 first bytes of 3-byte sequences (0xe0-0xef), each followed by a space character' => [
                  implode(' ', range("\xE0", "\xEF")) . ' ',
                  str_repeat(' ', 16),
              ],
              '3.2.3 All 8 first bytes of 4-byte sequences (0xf0-0xf7), each followed by a space character' => [
                  implode(' ', range("\xF0", "\xF7")) . ' ',
                  str_repeat(' ', 8),
              ],
              '3.2.4 All 4 first bytes of 5-byte sequences (0xf8-0xfb), each followed by a space character' => [
                  implode(' ', range("\xF8", "\xFB")) . ' ',
                  str_repeat(' ', 4),
              ],
              '3.2.5 All 2 first bytes of 6-byte sequences (0xfc-0xfd), each followed by a space character' => [
                  implode(' ', range("\xFC", "\xFD")) . ' ',
                  str_repeat(' ', 2),
              ],
          ],
          '3.3 Sequences with last continuation byte missing' => [
              '3.3.1 2-byte sequence with last byte missing (U+0000)' => ["\xC0", ''],
              '3.3.2 3-byte sequence with last byte missing (U+0000)' => ["\xE0\x80", ''],
              '3.3.3 4-byte sequence with last byte missing (U+0000)' => ["\xF0\x80\x80", ''],
              '3.3.4 5-byte sequence with last byte missing (U+0000)' => ["\xF8\x80\x80\x80", ''],
              '3.3.5 6-byte sequence with last byte missing (U+0000)' => ["\xFC\x80\x80\x80\x80", ''],
              '3.3.6 2-byte sequence with last byte missing (U-000007FF)' => ["\xDF", ''],
              '3.3.7 3-byte sequence with last byte missing (U-0000FFFF)' => ["\xEF\xBF", ''],
              '3.3.8 4-byte sequence with last byte missing (U-001FFFFF)' => ["\xF7\xBF\xBF", ''],
              '3.3.9 5-byte sequence with last byte missing (U-03FFFFFF)' => ["\xFB\xBF\xBF\xBF", ''],
              '3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF)' => ["\xFD\xBF\xBF\xBF\xBF", ''],
          ],
          // This entry is itself a leaf, one level higher than its siblings' leaves.
          '3.4 Concatenation of incomplete sequences' => [
              "\xC0\xE0\x80\xF0\x80\x80\xF8\x80\x80\x80\xFC\x80\x80\x80\x80\xDF\xEF\xBF\xF7\xBF\xBF\xFB\xBF\xBF\xBF\xFD\xBF\xBF\xBF\xBF",
              '',
          ],
          '3.5 Impossible bytes' => [
              '3.5.1 fe' => ["\xFE", ''],
              '3.5.2 ff' => ["\xFF", ''],
              '3.5.3 fe fe ff ff' => ["\xFE\xFE\xFF\xFF", ''],
          ],
      ],

      // Section 4: longer-than-necessary encodings.
      '4 Overlong sequences' => [
          '4.1 Examples of an overlong ASCII character' => [
              '4.1.1 U+002F = c0 af' => ["\xC0\xAF", ''],
              '4.1.2 U+002F = e0 80 af' => ["\xE0\x80\xAF", ''],
              '4.1.3 U+002F = f0 80 80 af' => ["\xF0\x80\x80\xAF", ''],
              '4.1.4 U+002F = f8 80 80 80 af' => ["\xF8\x80\x80\x80\xAF", ''],
              '4.1.5 U+002F = fc 80 80 80 80 af' => ["\xFC\x80\x80\x80\x80\xAF", ''],
          ],
          '4.2 Maximum overlong sequences' => [
              '4.2.1 U-0000007F = c1 bf' => ["\xC1\xBF", ''],
              '4.2.2 U-000007FF = e0 9f bf' => ["\xE0\x9F\xBF", ''],
              '4.2.3 U-0000FFFF = f0 8f bf bf' => ["\xF0\x8F\xBF\xBF", ''],
              '4.2.4 U-001FFFFF = f8 87 bf bf bf' => ["\xF8\x87\xBF\xBF\xBF", ''],
              '4.2.5 U-03FFFFFF = fc 83 bf bf bf bf' => ["\xFC\x83\xBF\xBF\xBF\xBF", ''],
          ],
          '4.3 Overlong representation of the NUL character' => [
              '4.3.1 U+0000 = c0 80' => ["\xC0\x80", ''],
              '4.3.2 U+0000 = e0 80 80' => ["\xE0\x80\x80", ''],
              '4.3.3 U+0000 = f0 80 80 80' => ["\xF0\x80\x80\x80", ''],
              '4.3.4 U+0000 = f8 80 80 80 80' => ["\xF8\x80\x80\x80\x80", ''],
              '4.3.5 U+0000 = fc 80 80 80 80 80' => ["\xFC\x80\x80\x80\x80\x80", ''],
          ],
      ],

      // Section 5: surrogates and noncharacters.
      '5 Illegal code positions' => [
          '5.1 Single UTF-16 surrogates' => [
              '5.1.1 U+D800 = ed a0 80' => ["\xED\xA0\x80", ''],
              '5.1.2 U+DB7F = ed ad bf' => ["\xED\xAD\xBF", ''],
              '5.1.3 U+DB80 = ed ae 80' => ["\xED\xAE\x80", ''],
              '5.1.4 U+DBFF = ed af bf' => ["\xED\xAF\xBF", ''],
              '5.1.5 U+DC00 = ed b0 80' => ["\xED\xB0\x80", ''],
              '5.1.6 U+DF80 = ed be 80' => ["\xED\xBE\x80", ''],
              '5.1.7 U+DFFF = ed bf bf' => ["\xED\xBF\xBF", ''],
          ],
          '5.2 Paired UTF-16 surrogates' => [
              '5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80' => ["\xED\xA0\x80\xED\xB0\x80", ''],
              '5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf' => ["\xED\xA0\x80\xED\xBF\xBF", ''],
              '5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80' => ["\xED\xAD\xBF\xED\xB0\x80", ''],
              '5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf' => ["\xED\xAD\xBF\xED\xBF\xBF", ''],
              '5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80' => ["\xED\xAE\x80\xED\xB0\x80", ''],
              '5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf' => ["\xED\xAE\x80\xED\xBF\xBF", ''],
              '5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80' => ["\xED\xAF\xBF\xED\xB0\x80", ''],
              '5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf' => ["\xED\xAF\xBF\xED\xBF\xBF", ''],
          ],
          // Noncharacters are allowed according to
          // http://www.unicode.org/versions/corrigendum9.html, so these two
          // inputs are expected back unchanged despite the section title.
          '5.3 Other illegal code positions' => [
              '5.3.1 U+FFFE = ef bf be' => ["\u{FFFE}", "\u{FFFE}"],
              '5.3.2 U+FFFF = ef bf bf' => ["\u{FFFF}", "\u{FFFF}"],
          ],
      ],
  ];
  // Walk the nested $tests tree depth-first with an explicit stack instead of
  // recursion. An entry is either a group (array keyed by section labels) or
  // a leaf (list whose index 0 is the input and index 1 the expected output).
  $stack = [$tests];

  // Loop on the stack itself, not on the popped value: an empty array is falsy
  // in PHP, so testing the popped value would end the run early on an empty
  // group and silently skip every case still queued.
  while ($stack !== []) {
      $item = array_pop($stack);

      if (isset($item[0])) {
          // Leaf: index 2 carries the label attached when the leaf was queued.
          [$in, $out, $label] = $item;
          echo "$label\n";

          // Wrap both sides in 'a'…'b' so the fix is checked in the middle of
          // a string, with valid bytes on either side of the tested input.
          Assert::same('a' . $out . 'b', Strings::fixEncoding('a' . $in . 'b'));
      } else {
          // Group: queue the children in reverse so that array_pop() yields
          // them in their declared order. array_reverse() keeps string keys.
          // A dedicated loop variable is used so $tests is not overwritten.
          foreach (array_reverse($item) as $label => $children) {
              // Attach the label at index 2 only for leaves; the array union
              // never overwrites the existing indexes 0 and 1.
              $stack[] = $children + (isset($children[0]) ? [2 => $label] : []);
          }
      }
  }