<?php
// EncoderTest.php
  /**
   * Exercises the static helpers of HTMLPurifier_Encoder: UTF-8 cleaning,
   * conversion to and from UTF-8, lossless ASCII escaping, the ASCII-support
   * probe and the iconv wrapper.
   *
   * $this->config and $this->context are not declared in this class; they are
   * presumably supplied by HTMLPurifier_Harness -- confirm against the harness.
   */
  class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness
  {
      /**
       * Value returned by HTMLPurifier_EntityLookup::instance(); assigned in setUp().
       */
      protected $_entity_lookup;

      /**
       * Stores the entity lookup, then runs the parent harness setup.
       */
      public function setUp()
      {
          $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
          parent::setUp();
      }

      /**
       * Asserts that cleanUTF8() produces $expect for $string, once with the
       * default call (labelled "iconv") and once with the second argument set
       * to true (labelled "PHP").
       *
       * @param string      $string Input handed to cleanUTF8().
       * @param string|null $expect Expected output; null means "same as input".
       */
      public function assertCleanUTF8($string, $expect = null)
      {
          $wanted = ($expect === null) ? $string : $expect;

          $viaDefault = HTMLPurifier_Encoder::cleanUTF8($string);
          $this->assertIdentical($viaDefault, $wanted, 'iconv: %s');

          $viaForced = HTMLPurifier_Encoder::cleanUTF8($string, true);
          $this->assertIdentical($viaForced, $wanted, 'PHP: %s');
      }

      /**
       * Runs cleanUTF8() over a table of clean, control-character and
       * malformed inputs.
       */
      public function test_cleanUTF8()
      {
          // Each row is array(input, expected); a null expectation means the
          // input must come back unchanged.
          $cases = array(
              array('Normal string.', null),
              array("Test\tAllowed\nControl\rCharacters", null),
              array("null byte: \0", 'null byte: '),
              array("あ(い)う(え)お\0", "あ(い)う(え)お"), // test for issue #122
              array("\1\2\3\4\5\6\7", ''),
              array("\x7F", ''), // one byte invalid SGML char
              array("\xC2\x80", ''), // two byte invalid SGML
              array("\xF3\xBF\xBF\xBF", null), // valid four byte
              array("\xDF\xFF", ''), // malformed UTF8
              array("\xED\xB0\x80", ''), // invalid codepoints
          );

          foreach ($cases as $case) {
              list($input, $expected) = $case;
              $this->assertCleanUTF8($input, $expected);
          }
      }

      /**
       * With Core.Encoding left untouched, convertToUTF8() must hand the input
       * back as-is.
       */
      public function test_convertToUTF8_noConvert()
      {
          // UTF-8 input is not touched, even though a lone F6 byte is invalid.
          $invalid = "\xF6";
          $converted = HTMLPurifier_Encoder::convertToUTF8($invalid, $this->config, $this->context);
          $this->assertIdentical($converted, $invalid, 'Expected identical [Binary: F6]');
      }

      /**
       * An unknown encoding name must raise an error and yield an empty string.
       * Returns early when iconvAvailable() is false.
       */
      public function test_convertToUTF8_spuriousEncoding()
      {
          if (!HTMLPurifier_Encoder::iconvAvailable()) {
              return;
          }

          $this->config->set('Core.Encoding', 'utf99');
          // The expectation has to be registered before the call that triggers it.
          $this->expectError('Invalid encoding utf99');

          $converted = HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context);
          $this->assertIdentical($converted, '');
      }

      /**
       * ISO-8859-1 byte F6 must become the two-byte UTF-8 sequence C3 B6.
       */
      public function test_convertToUTF8_iso8859_1()
      {
          $this->config->set('Core.Encoding', 'ISO-8859-1');

          $converted = HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context);
          $this->assertIdentical($converted, "\xC3\xB6");
      }

      /**
       * Same conversion as above, but with Test.ForceNoIconv switched on.
       */
      public function test_convertToUTF8_withoutIconv()
      {
          $this->config->set('Core.Encoding', 'ISO-8859-1');
          $this->config->set('Test.ForceNoIconv', true);

          $converted = HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context);
          $this->assertIdentical($converted, "\xC3\xB6");
      }

      /**
       * Fixture: two three-byte UTF-8 characters followed by " (Chinese)".
       *
       * @return string
       */
      public function getZhongWen()
      {
          $zhongWen = "\xE4\xB8\xAD\xE6\x96\x87";

          return $zhongWen . " (Chinese)";
      }

      /**
       * With Core.Encoding left untouched, convertFromUTF8() must hand the
       * input back as-is.
       */
      public function test_convertFromUTF8_utf8()
      {
          // UTF-8 means that we don't touch it
          $utf8 = "\xC3\xB6";
          $converted = HTMLPurifier_Encoder::convertFromUTF8($utf8, $this->config, $this->context);
          $this->assertIdentical($converted, $utf8);
      }

      /**
       * UTF-8 sequence C3 B6 must become the single ISO-8859-1 byte F6.
       */
      public function test_convertFromUTF8_iso8859_1()
      {
          $this->config->set('Core.Encoding', 'ISO-8859-1');

          $converted = HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context);
          $this->assertIdentical($converted, "\xF6", 'Expected identical [Binary: F6]');
      }

      /**
       * Characters with no ISO-8859-1 equivalent are expected to be dropped.
       * Returns early when iconvAvailable() is false.
       */
      public function test_convertFromUTF8_iconvNoChars()
      {
          if (!HTMLPurifier_Encoder::iconvAvailable()) {
              return;
          }

          $this->config->set('Core.Encoding', 'ISO-8859-1');

          $converted = HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context);
          $this->assertIdentical($converted, " (Chinese)");
      }

      /**
       * A representable character converts normally with Test.ForceNoIconv on.
       */
      public function test_convertFromUTF8_phpNormal()
      {
          // Plain PHP implementation has slightly different behavior
          $this->config->set('Core.Encoding', 'ISO-8859-1');
          $this->config->set('Test.ForceNoIconv', true);

          $converted = HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context);
          $this->assertIdentical($converted, "\xF6", 'Expected identical [Binary: F6]');
      }

      /**
       * With Test.ForceNoIconv on, each unrepresentable character is expected
       * to come back as a question mark.
       */
      public function test_convertFromUTF8_phpNoChars()
      {
          $this->config->set('Core.Encoding', 'ISO-8859-1');
          $this->config->set('Test.ForceNoIconv', true);

          $converted = HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context);
          $this->assertIdentical($converted, "?? (Chinese)");
      }

      /**
       * Core.EscapeNonASCIICharacters must turn unrepresentable characters into
       * numeric entities when the target encoding is ISO-8859-1.
       */
      public function test_convertFromUTF8_withProtection()
      {
          // Preserve the characters!
          $this->config->set('Core.Encoding', 'ISO-8859-1');
          $this->config->set('Core.EscapeNonASCIICharacters', true);

          $converted = HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context);
          $this->assertIdentical($converted, "&#20013;&#25991; (Chinese)");
      }

      /**
       * Core.EscapeNonASCIICharacters must also apply when Core.Encoding has
       * not been changed.
       */
      public function test_convertFromUTF8_withProtectionButUtf8()
      {
          // Preserve the characters!
          $this->config->set('Core.EscapeNonASCIICharacters', true);

          $converted = HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context);
          $this->assertIdentical($converted, "&#20013;&#25991; (Chinese)");
      }

      /**
       * convertToASCIIDumbLossless() must replace multi-byte characters with
       * numeric entities and leave plain ASCII alone.
       */
      public function test_convertToASCIIDumbLossless()
      {
          // input => expected output
          $cases = array(
              "\xC3\x9Eorn" => "&#222;orn", // Uppercase thorn letter
              "an" => "an",
              "\xF3\xA0\x80\xA0" => "&#917536;", // test up to four bytes
          );

          foreach ($cases as $input => $expected) {
              $escaped = HTMLPurifier_Encoder::convertToASCIIDumbLossless($input);
              $this->assertIdentical($escaped, $expected);
          }
      }

      /**
       * Asserts that testEncodingSupportsASCII() returns $ret for $enc, both
       * with and without the second argument set to true. Does nothing when
       * the initial probe returns false.
       *
       * @param string $enc Encoding name to probe.
       * @param array  $ret Expected return value.
       */
      public function assertASCIISupportCheck($enc, $ret)
      {
          $probe = HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true);
          if ($probe === false) {
              return;
          }

          $viaDefault = HTMLPurifier_Encoder::testEncodingSupportsASCII($enc);
          $this->assertIdentical($viaDefault, $ret);

          $viaBypass = HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true);
          $this->assertIdentical($viaBypass, $ret);
      }

      /**
       * Checks the ASCII-support probe for several encodings; the Shift_JIS
       * and JOHAB cases only run when iconvAvailable() is true.
       */
      public function test_testEncodingSupportsASCII()
      {
          if (HTMLPurifier_Encoder::iconvAvailable()) {
              $shiftJis = array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
              $this->assertASCIISupportCheck('Shift_JIS', $shiftJis);

              $johab = array("\xE2\x82\xA9" => '\\');
              $this->assertASCIISupportCheck('JOHAB', $johab);
          }

          $this->assertASCIISupportCheck('ISO-8859-1', array());
          $this->assertASCIISupportCheck('dontexist', array()); // canary
      }

      /**
       * Backslash and tilde must pass through unchanged in both directions
       * when Core.Encoding is Shift_JIS. Returns early when iconvAvailable()
       * is false.
       */
      public function testShiftJIS()
      {
          if (!HTMLPurifier_Encoder::iconvAvailable()) {
              return;
          }

          $this->config->set('Core.Encoding', 'Shift_JIS');

          // This actually looks like a Yen, but we're going to treat it differently
          $ascii = '\\~';

          $encoded = HTMLPurifier_Encoder::convertFromUTF8($ascii, $this->config, $this->context);
          $this->assertIdentical($encoded, $ascii);

          $decoded = HTMLPurifier_Encoder::convertToUTF8($ascii, $this->config, $this->context);
          $this->assertIdentical($decoded, $ascii);
      }

      /**
       * A long input with an unrepresentable leading character must keep all
       * 10000 trailing characters. Only runs when iconv is available and
       * testIconvTruncateBug() reports ICONV_TRUNCATES.
       */
      public function testIconvTruncateBug()
      {
          if (!HTMLPurifier_Encoder::iconvAvailable()
              || HTMLPurifier_Encoder::testIconvTruncateBug() !== HTMLPurifier_Encoder::ICONV_TRUNCATES
          ) {
              return;
          }

          $this->config->set('Core.Encoding', 'ISO-8859-1');

          $tail = str_repeat('a', 10000);
          $converted = HTMLPurifier_Encoder::convertFromUTF8("\xE4\xB8\xAD" . $tail, $this->config, $this->context);
          $this->assertIdentical($converted, $tail);
      }

      /**
       * Drives HTMLPurifier_Encoder::iconv() with a fourth argument of 4 and
       * multi-byte characters at different offsets. Only runs when iconv is
       * available and testIconvTruncateBug() reports ICONV_TRUNCATES.
       */
      public function testIconvChunking()
      {
          if (!HTMLPurifier_Encoder::iconvAvailable()
              || HTMLPurifier_Encoder::testIconvTruncateBug() !== HTMLPurifier_Encoder::ICONV_TRUNCATES
          ) {
              return;
          }

          // Each row is array(input, expected); the fourth iconv() argument is
          // presumably the chunk size -- confirm against HTMLPurifier_Encoder.
          $cases = array(
              array("a\xF3\xA0\x80\xA0b", 'ab'),
              array("aa\xE4\xB8\xADb", 'aab'),
              array("aaa\xCE\xB1b", 'aaab'),
              array("aaaa\xF3\xA0\x80\xA0b", 'aaaab'),
              array("aaaa\xE4\xB8\xADb", 'aaaab'),
              array("aaaa\xCE\xB1b", 'aaaab'),
          );

          foreach ($cases as $case) {
              list($input, $expected) = $case;
              $converted = HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', $input, 4);
              $this->assertIdentical($converted, $expected);
          }
      }
  }
// vim: et sw=4 sts=4