<?php
  2. class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
  3. {
  4. protected $_has_pear = false;
  5. public function __construct()
  6. {
  7. parent::__construct();
  8. if ($GLOBALS['HTMLPurifierTest']['PH5P']) {
  9. require_once 'HTMLPurifier/Lexer/PH5P.php';
  10. }
  11. }
  12. // HTMLPurifier_Lexer::create() --------------------------------------------
  13. public function test_create()
  14. {
  15. $this->config->set('Core.MaintainLineNumbers', true);
  16. $lexer = HTMLPurifier_Lexer::create($this->config);
  17. $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
  18. }
  19. public function test_create_objectLexerImpl()
  20. {
  21. $this->config->set('Core.LexerImpl', new HTMLPurifier_Lexer_DirectLex());
  22. $lexer = HTMLPurifier_Lexer::create($this->config);
  23. $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
  24. }
  25. public function test_create_unknownLexer()
  26. {
  27. $this->config->set('Core.LexerImpl', 'AsdfAsdf');
  28. $this->expectException(new HTMLPurifier_Exception('Cannot instantiate unrecognized Lexer type AsdfAsdf'));
  29. HTMLPurifier_Lexer::create($this->config);
  30. }
  31. public function test_create_incompatibleLexer()
  32. {
  33. $this->config->set('Core.LexerImpl', 'DOMLex');
  34. $this->config->set('Core.MaintainLineNumbers', true);
  35. $this->expectException(new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'));
  36. HTMLPurifier_Lexer::create($this->config);
  37. }
  38. // HTMLPurifier_Lexer->parseData() -----------------------------------------
  39. public function assertParseData($input, $expect = true, $is_attr = false)
  40. {
  41. if ($expect === true) $expect = $input;
  42. $lexer = new HTMLPurifier_Lexer();
  43. $this->assertIdentical($expect, $lexer->parseData($input, $is_attr, $this->config));
  44. }
  45. public function test_parseData_plainText()
  46. {
  47. $this->assertParseData('asdf');
  48. }
  49. public function test_parseData_ampersandEntity()
  50. {
  51. $this->assertParseData('&amp;', '&');
  52. }
  53. public function test_parseData_quotEntity()
  54. {
  55. $this->assertParseData('&quot;', '"');
  56. }
  57. public function test_parseData_aposNumericEntity()
  58. {
  59. $this->assertParseData('&#039;', "'");
  60. }
  61. public function test_parseData_aposCompactNumericEntity()
  62. {
  63. $this->assertParseData('&#39;', "'");
  64. }
  65. public function test_parseData_adjacentAmpersandEntities()
  66. {
  67. $this->assertParseData('&amp;&amp;&amp;', '&&&');
  68. }
  69. public function test_parseData_trailingUnescapedAmpersand()
  70. {
  71. $this->assertParseData('&amp;&', '&&');
  72. }
  73. public function test_parseData_internalUnescapedAmpersand()
  74. {
  75. $this->assertParseData('Procter & Gamble');
  76. }
  77. public function test_parseData_improperEntityFaultToleranceTest()
  78. {
  79. $this->assertParseData('&#x2D;', '-');
  80. }
  81. public function test_parseData_noTrailingSemi()
  82. {
  83. $this->assertParseData('&ampA', '&A');
  84. }
  85. public function test_parseData_noTrailingSemiAttr()
  86. {
  87. $this->assertParseData('&ampA', '&ampA', true);
  88. }
  89. public function test_parseData_T119()
  90. {
  91. $this->assertParseData('&ampA', '&ampA', true);
  92. }
  93. public function test_parseData_T119b()
  94. {
  95. $this->assertParseData('&trade=', true, true);
  96. }
  97. public function test_parseData_legacy1()
  98. {
  99. $this->config->set('Core.LegacyEntityDecoder', true);
  100. $this->assertParseData('&ampa', true);
  101. $this->assertParseData('&amp=', "&=");
  102. $this->assertParseData('&ampa', true, true);
  103. $this->assertParseData('&amp=', "&=", true);
  104. $this->assertParseData('&lta', true);
  105. $this->assertParseData('&lt=', "<=");
  106. $this->assertParseData('&lta', true, true);
  107. $this->assertParseData('&lt=', "<=", true);
  108. }
  109. public function test_parseData_nonlegacy1()
  110. {
  111. $this->assertParseData('&ampa', "&a");
  112. $this->assertParseData('&amp=', "&=");
  113. $this->assertParseData('&ampa', true, true);
  114. $this->assertParseData('&amp=', true, true);
  115. $this->assertParseData('&lta', "<a");
  116. $this->assertParseData('&lt=', "<=");
  117. $this->assertParseData('&lta', true, true);
  118. $this->assertParseData('&lt=', true, true);
  119. $this->assertParseData('&lta;', "<a;");
  120. }
  121. public function test_parseData_noTrailingSemiNever()
  122. {
  123. $this->assertParseData('&imath');
  124. }
  125. // HTMLPurifier_Lexer->extractBody() ---------------------------------------
  126. public function assertExtractBody($text, $extract = true)
  127. {
  128. $lexer = new HTMLPurifier_Lexer();
  129. $result = $lexer->extractBody($text);
  130. if ($extract === true) $extract = $text;
  131. $this->assertIdentical($extract, $result);
  132. }
  133. public function test_extractBody_noBodyTags()
  134. {
  135. $this->assertExtractBody('<b>Bold</b>');
  136. }
  137. public function test_extractBody_lowercaseBodyTags()
  138. {
  139. $this->assertExtractBody('<html><body><b>Bold</b></body></html>', '<b>Bold</b>');
  140. }
  141. public function test_extractBody_uppercaseBodyTags()
  142. {
  143. $this->assertExtractBody('<HTML><BODY><B>Bold</B></BODY></HTML>', '<B>Bold</B>');
  144. }
  145. public function test_extractBody_realisticUseCase()
  146. {
  147. $this->assertExtractBody(
  148. '<?xml version="1.0"
  149. <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  150. "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
  151. <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
  152. <head>
  153. <title>xyz</title>
  154. </head>
  155. <body>
  156. <form method="post" action="whatever1">
  157. <div>
  158. <input type="text" name="username" />
  159. <input type="text" name="password" />
  160. <input type="submit" />
  161. </div>
  162. </form>
  163. </body>
  164. </html>',
  165. '
  166. <form method="post" action="whatever1">
  167. <div>
  168. <input type="text" name="username" />
  169. <input type="text" name="password" />
  170. <input type="submit" />
  171. </div>
  172. </form>
  173. ');
  174. }
  175. public function test_extractBody_bodyWithAttributes()
  176. {
  177. $this->assertExtractBody('<html><body bgcolor="#F00"><b>Bold</b></body></html>', '<b>Bold</b>');
  178. }
  179. public function test_extractBody_preserveUnclosedBody()
  180. {
  181. $this->assertExtractBody('<body>asdf'); // not closed, don't accept
  182. }
  183. public function test_extractBody_useLastBody()
  184. {
  185. $this->assertExtractBody('<body>foo</body>bar</body>', 'foo</body>bar');
  186. }
  187. public function test_extractBody_ignoreCommented()
  188. {
  189. $this->assertExtractBody('$<!-- <body>foo</body> -->^');
  190. }
  191. public function test_extractBody_butCanStillWork()
  192. {
  193. $this->assertExtractBody('<!-- b --><body>a</body>', 'a');
  194. }
  195. // HTMLPurifier_Lexer->tokenizeHTML() --------------------------------------
  196. public function assertTokenization($input, $expect, $alt_expect = array())
  197. {
  198. $lexers = array();
  199. $lexers['DirectLex'] = new HTMLPurifier_Lexer_DirectLex();
  200. if (class_exists('DOMDocument')) {
  201. $lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
  202. $lexers['PH5P'] = new HTMLPurifier_Lexer_PH5P();
  203. }
  204. foreach ($lexers as $name => $lexer) {
  205. $result = $lexer->tokenizeHTML($input, $this->config, $this->context);
  206. if (isset($alt_expect[$name])) {
  207. if ($alt_expect[$name] === false) continue;
  208. $t_expect = $alt_expect[$name];
  209. $this->assertIdentical($result, $alt_expect[$name], "$name: %s");
  210. } else {
  211. $t_expect = $expect;
  212. $this->assertIdentical($result, $expect, "$name: %s");
  213. }
  214. if ($t_expect != $result) {
  215. printTokens($result);
  216. }
  217. }
  218. }
  219. public function test_tokenizeHTML_emptyInput()
  220. {
  221. $this->assertTokenization('', array());
  222. }
  223. public function test_tokenizeHTML_plainText()
  224. {
  225. $this->assertTokenization(
  226. 'This is regular text.',
  227. array(
  228. new HTMLPurifier_Token_Text('This is regular text.')
  229. )
  230. );
  231. }
  232. public function test_tokenizeHTML_textAndTags()
  233. {
  234. $this->assertTokenization(
  235. 'This is <b>bold</b> text',
  236. array(
  237. new HTMLPurifier_Token_Text('This is '),
  238. new HTMLPurifier_Token_Start('b', array()),
  239. new HTMLPurifier_Token_Text('bold'),
  240. new HTMLPurifier_Token_End('b'),
  241. new HTMLPurifier_Token_Text(' text'),
  242. )
  243. );
  244. }
  245. public function test_tokenizeHTML_normalizeCase()
  246. {
  247. $this->assertTokenization(
  248. '<DIV>Totally rad dude. <b>asdf</b></div>',
  249. array(
  250. new HTMLPurifier_Token_Start('DIV', array()),
  251. new HTMLPurifier_Token_Text('Totally rad dude. '),
  252. new HTMLPurifier_Token_Start('b', array()),
  253. new HTMLPurifier_Token_Text('asdf'),
  254. new HTMLPurifier_Token_End('b'),
  255. new HTMLPurifier_Token_End('div'),
  256. )
  257. );
  258. }
  259. public function test_tokenizeHTML_notWellFormed()
  260. {
  261. $this->assertTokenization(
  262. '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>',
  263. array(
  264. new HTMLPurifier_Token_Start('asdf'),
  265. new HTMLPurifier_Token_End('asdf'),
  266. new HTMLPurifier_Token_Start('d'),
  267. new HTMLPurifier_Token_End('d'),
  268. new HTMLPurifier_Token_Start('poOloka'),
  269. new HTMLPurifier_Token_Start('poolasdf'),
  270. new HTMLPurifier_Token_Start('ds'),
  271. new HTMLPurifier_Token_End('asdf'),
  272. new HTMLPurifier_Token_End('ASDF'),
  273. ),
  274. array(
  275. 'DOMLex' => $alt = array(
  276. new HTMLPurifier_Token_Empty('asdf'),
  277. new HTMLPurifier_Token_Empty('d'),
  278. new HTMLPurifier_Token_Start('pooloka'),
  279. new HTMLPurifier_Token_Start('poolasdf'),
  280. new HTMLPurifier_Token_Empty('ds'),
  281. new HTMLPurifier_Token_End('poolasdf'),
  282. new HTMLPurifier_Token_End('pooloka'),
  283. ),
  284. // 20140831: Weird, but whatever...
  285. 'PH5P' => array(new HTMLPurifier_Token_Empty('asdf')),
  286. )
  287. );
  288. }
  289. public function test_tokenizeHTML_whitespaceInTag()
  290. {
  291. $this->assertTokenization(
  292. '<a'."\t".'href="foobar.php"'."\n".'title="foo!">Link to <b id="asdf">foobar</b></a>',
  293. array(
  294. new HTMLPurifier_Token_Start('a',array('href'=>'foobar.php','title'=>'foo!')),
  295. new HTMLPurifier_Token_Text('Link to '),
  296. new HTMLPurifier_Token_Start('b',array('id'=>'asdf')),
  297. new HTMLPurifier_Token_Text('foobar'),
  298. new HTMLPurifier_Token_End('b'),
  299. new HTMLPurifier_Token_End('a'),
  300. )
  301. );
  302. }
  303. public function test_tokenizeHTML_singleAttribute()
  304. {
  305. $this->assertTokenization(
  306. '<br style="&amp;" />',
  307. array(
  308. new HTMLPurifier_Token_Empty('br', array('style' => '&'))
  309. )
  310. );
  311. }
  312. public function test_tokenizeHTML_emptyTag()
  313. {
  314. $this->assertTokenization(
  315. '<br />',
  316. array( new HTMLPurifier_Token_Empty('br') )
  317. );
  318. }
  319. public function test_tokenizeHTML_comment()
  320. {
  321. $this->assertTokenization(
  322. '<!-- Comment -->',
  323. array( new HTMLPurifier_Token_Comment(' Comment ') )
  324. );
  325. }
  326. public function test_tokenizeHTML_malformedComment()
  327. {
  328. $this->assertTokenization(
  329. '<!-- not so well formed --->',
  330. array( new HTMLPurifier_Token_Comment(' not so well formed -') )
  331. );
  332. }
  333. public function test_tokenizeHTML_unterminatedTag()
  334. {
  335. $this->assertTokenization(
  336. '<a href=""',
  337. array( new HTMLPurifier_Token_Text('<a href=""') ),
  338. array(
  339. // I like our behavior better, but it's non-standard
  340. 'DOMLex' => array( new HTMLPurifier_Token_Empty('a', array('href'=>'')) ),
  341. 'PH5P' => false, // total barfing, grabs scaffolding too
  342. )
  343. );
  344. }
  345. public function test_tokenizeHTML_specialEntities()
  346. {
  347. $this->assertTokenization(
  348. '&lt;b&gt;',
  349. array(
  350. new HTMLPurifier_Token_Text('<b>')
  351. ),
  352. array(
  353. // some parsers will separate entities out
  354. 'PH5P' => array(
  355. new HTMLPurifier_Token_Text('<'),
  356. new HTMLPurifier_Token_Text('b'),
  357. new HTMLPurifier_Token_Text('>'),
  358. ),
  359. )
  360. );
  361. }
  362. public function test_tokenizeHTML_earlyQuote()
  363. {
  364. $this->assertTokenization(
  365. '<a "=>',
  366. array( new HTMLPurifier_Token_Empty('a') ),
  367. array(
  368. // we barf on this input
  369. 'DirectLex' => array(
  370. new HTMLPurifier_Token_Start('a', array('"' => ''))
  371. ),
  372. 'PH5P' => false, // behavior varies; handle this personally
  373. )
  374. );
  375. }
  376. public function test_tokenizeHTML_earlyQuote_PH5P()
  377. {
  378. if (!class_exists('DOMDocument')) return;
  379. $lexer = new HTMLPurifier_Lexer_PH5P();
  380. $result = $lexer->tokenizeHTML('<a "=>', $this->config, $this->context);
  381. if ($this->context->get('PH5PError', true)) {
  382. $this->assertIdentical(array(
  383. new HTMLPurifier_Token_Start('a', array('"' => ''))
  384. ), $result);
  385. } else {
  386. $this->assertIdentical(array(
  387. new HTMLPurifier_Token_Empty('a', array('"' => ''))
  388. ), $result);
  389. }
  390. }
  391. public function test_tokenizeHTML_unescapedQuote()
  392. {
  393. $this->assertTokenization(
  394. '"',
  395. array( new HTMLPurifier_Token_Text('"') )
  396. );
  397. }
  398. public function test_tokenizeHTML_escapedQuote()
  399. {
  400. $this->assertTokenization(
  401. '&quot;',
  402. array( new HTMLPurifier_Token_Text('"') )
  403. );
  404. }
  405. public function test_tokenizeHTML_cdata()
  406. {
  407. $this->assertTokenization(
  408. '<![CDATA[You <b>can&#39;t</b> get me!]]>',
  409. array( new HTMLPurifier_Token_Text('You <b>can&#39;t</b> get me!') ),
  410. array(
  411. 'PH5P' => array(
  412. new HTMLPurifier_Token_Text('You '),
  413. new HTMLPurifier_Token_Text('<'),
  414. new HTMLPurifier_Token_Text('b'),
  415. new HTMLPurifier_Token_Text('>'),
  416. new HTMLPurifier_Token_Text('can'),
  417. new HTMLPurifier_Token_Text('&'),
  418. new HTMLPurifier_Token_Text('#39;t'),
  419. new HTMLPurifier_Token_Text('<'),
  420. new HTMLPurifier_Token_Text('/b'),
  421. new HTMLPurifier_Token_Text('>'),
  422. new HTMLPurifier_Token_Text(' get me!'),
  423. ),
  424. )
  425. );
  426. }
  427. public function test_tokenizeHTML_characterEntity()
  428. {
  429. $this->assertTokenization(
  430. '&theta;',
  431. array( new HTMLPurifier_Token_Text("\xCE\xB8") )
  432. );
  433. }
  434. public function test_tokenizeHTML_characterEntityInCDATA()
  435. {
  436. $this->assertTokenization(
  437. '<![CDATA[&rarr;]]>',
  438. array( new HTMLPurifier_Token_Text("&rarr;") ),
  439. array(
  440. 'PH5P' => array(
  441. new HTMLPurifier_Token_Text('&'),
  442. new HTMLPurifier_Token_Text('rarr;'),
  443. ),
  444. )
  445. );
  446. }
  447. public function test_tokenizeHTML_entityInAttribute()
  448. {
  449. $this->assertTokenization(
  450. '<a href="index.php?title=foo&amp;id=bar">Link</a>',
  451. array(
  452. new HTMLPurifier_Token_Start('a',array('href' => 'index.php?title=foo&id=bar')),
  453. new HTMLPurifier_Token_Text('Link'),
  454. new HTMLPurifier_Token_End('a'),
  455. )
  456. );
  457. }
  458. public function test_tokenizeHTML_preserveUTF8()
  459. {
  460. $this->assertTokenization(
  461. "\xCE\xB8",
  462. array( new HTMLPurifier_Token_Text("\xCE\xB8") )
  463. );
  464. }
  465. public function test_tokenizeHTML_specialEntityInAttribute()
  466. {
  467. $this->assertTokenization(
  468. '<br test="x &lt; 6" />',
  469. array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) )
  470. );
  471. }
  472. public function test_tokenizeHTML_emoticonProtection()
  473. {
  474. $this->assertTokenization(
  475. '<b>Whoa! <3 That\'s not good >.></b>',
  476. array(
  477. new HTMLPurifier_Token_Start('b'),
  478. new HTMLPurifier_Token_Text('Whoa! '),
  479. new HTMLPurifier_Token_Text('<'),
  480. new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
  481. new HTMLPurifier_Token_End('b')
  482. ),
  483. array(
  484. // text is absorbed together
  485. 'DOMLex' => array(
  486. new HTMLPurifier_Token_Start('b'),
  487. new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
  488. new HTMLPurifier_Token_End('b'),
  489. ),
  490. 'PH5P' => array( // interesting grouping
  491. new HTMLPurifier_Token_Start('b'),
  492. new HTMLPurifier_Token_Text('Whoa! '),
  493. new HTMLPurifier_Token_Text('<'),
  494. new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
  495. new HTMLPurifier_Token_End('b'),
  496. ),
  497. )
  498. );
  499. }
  500. public function test_tokenizeHTML_commentWithFunkyChars()
  501. {
  502. $this->assertTokenization(
  503. '<!-- This >< comment --><br />',
  504. array(
  505. new HTMLPurifier_Token_Comment(' This >< comment '),
  506. new HTMLPurifier_Token_Empty('br'),
  507. )
  508. );
  509. }
  510. public function test_tokenizeHTML_unterminatedComment()
  511. {
  512. $this->assertTokenization(
  513. '<!-- This >< comment',
  514. array( new HTMLPurifier_Token_Comment(' This >< comment') ),
  515. array(
  516. 'DOMLex' => false,
  517. 'PH5P' => false,
  518. )
  519. );
  520. }
  521. public function test_tokenizeHTML_scriptCDATAContents()
  522. {
  523. $this->config->set('HTML.Trusted', true);
  524. $this->assertTokenization(
  525. 'Foo: <script>alert("<foo>");</script>',
  526. array(
  527. new HTMLPurifier_Token_Text('Foo: '),
  528. new HTMLPurifier_Token_Start('script'),
  529. new HTMLPurifier_Token_Text('alert("<foo>");'),
  530. new HTMLPurifier_Token_End('script'),
  531. ),
  532. array(
  533. // PH5P, for some reason, bubbles the script to <head>
  534. 'PH5P' => false,
  535. )
  536. );
  537. }
  538. public function test_tokenizeHTML_entitiesInComment()
  539. {
  540. $this->assertTokenization(
  541. '<!-- This comment < &lt; & -->',
  542. array( new HTMLPurifier_Token_Comment(' This comment < &lt; & ') )
  543. );
  544. }
  545. public function test_tokenizeHTML_attributeWithSpecialCharacters()
  546. {
  547. $this->assertTokenization(
  548. '<a href="><>">',
  549. array( new HTMLPurifier_Token_Empty('a', array('href' => '><>')) ),
  550. array(
  551. 'DirectLex' => array(
  552. new HTMLPurifier_Token_Start('a', array('href' => '')),
  553. new HTMLPurifier_Token_Text('<'),
  554. new HTMLPurifier_Token_Text('">'),
  555. )
  556. )
  557. );
  558. }
  559. public function test_tokenizeHTML_emptyTagWithSlashInAttribute()
  560. {
  561. $this->assertTokenization(
  562. '<param name="src" value="http://example.com/video.wmv" />',
  563. array( new HTMLPurifier_Token_Empty('param', array('name' => 'src', 'value' => 'http://example.com/video.wmv')) )
  564. );
  565. }
  566. public function test_tokenizeHTML_style()
  567. {
  568. $extra = array(
  569. // PH5P doesn't seem to like style tags
  570. 'PH5P' => false,
  571. // DirectLex defers to RemoveForeignElements for textification
  572. 'DirectLex' => array(
  573. new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
  574. new HTMLPurifier_Token_Comment("\ndiv {}\n"),
  575. new HTMLPurifier_Token_End('style'),
  576. ),
  577. );
  578. if (!defined('LIBXML_VERSION')) {
  579. // LIBXML_VERSION is missing in early versions of PHP
  580. // prior to 1.30 of php-src/ext/libxml/libxml.c (version-wise,
  581. // this translates to 5.0.x. In such cases, punt the test entirely.
  582. return;
  583. } elseif (LIBXML_VERSION < 20628) {
  584. // libxml's behavior is wrong prior to this version, so make
  585. // appropriate accomodations
  586. $extra['DOMLex'] = $extra['DirectLex'];
  587. }
  588. $this->assertTokenization(
  589. '<style type="text/css"><!--
  590. div {}
  591. --></style>',
  592. array(
  593. new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
  594. new HTMLPurifier_Token_Text("\ndiv {}\n"),
  595. new HTMLPurifier_Token_End('style'),
  596. ),
  597. $extra
  598. );
  599. }
  600. public function test_tokenizeHTML_tagWithAtSignAndExtraGt()
  601. {
  602. $alt_expect = array(
  603. // Technically this is invalid, but it won't be a
  604. // problem with invalid element removal; also, this
  605. // mimics Mozilla's parsing of the tag.
  606. new HTMLPurifier_Token_Start('a@'),
  607. new HTMLPurifier_Token_Text('>'),
  608. );
  609. $this->assertTokenization(
  610. '<a@>>',
  611. array(
  612. new HTMLPurifier_Token_Start('a'),
  613. new HTMLPurifier_Token_Text('>'),
  614. new HTMLPurifier_Token_End('a'),
  615. ),
  616. array(
  617. 'DirectLex' => $alt_expect,
  618. )
  619. );
  620. }
  621. public function test_tokenizeHTML_emoticonHeart()
  622. {
  623. $this->assertTokenization(
  624. '<br /><3<br />',
  625. array(
  626. new HTMLPurifier_Token_Empty('br'),
  627. new HTMLPurifier_Token_Text('<'),
  628. new HTMLPurifier_Token_Text('3'),
  629. new HTMLPurifier_Token_Empty('br'),
  630. ),
  631. array(
  632. 'DOMLex' => array(
  633. new HTMLPurifier_Token_Empty('br'),
  634. new HTMLPurifier_Token_Text('<3'),
  635. new HTMLPurifier_Token_Empty('br'),
  636. ),
  637. )
  638. );
  639. }
  640. public function test_tokenizeHTML_emoticonShiftyEyes()
  641. {
  642. $this->assertTokenization(
  643. '<b><<</b>',
  644. array(
  645. new HTMLPurifier_Token_Start('b'),
  646. new HTMLPurifier_Token_Text('<'),
  647. new HTMLPurifier_Token_Text('<'),
  648. new HTMLPurifier_Token_End('b'),
  649. ),
  650. array(
  651. 'DOMLex' => array(
  652. new HTMLPurifier_Token_Start('b'),
  653. new HTMLPurifier_Token_Text('<<'),
  654. new HTMLPurifier_Token_End('b'),
  655. ),
  656. )
  657. );
  658. }
  659. public function test_tokenizeHTML_eon1996()
  660. {
  661. $this->assertTokenization(
  662. '< <b>test</b>',
  663. array(
  664. new HTMLPurifier_Token_Text('<'),
  665. new HTMLPurifier_Token_Text(' '),
  666. new HTMLPurifier_Token_Start('b'),
  667. new HTMLPurifier_Token_Text('test'),
  668. new HTMLPurifier_Token_End('b'),
  669. ),
  670. array(
  671. 'DOMLex' => array(
  672. new HTMLPurifier_Token_Text('< '),
  673. new HTMLPurifier_Token_Start('b'),
  674. new HTMLPurifier_Token_Text('test'),
  675. new HTMLPurifier_Token_End('b'),
  676. ),
  677. )
  678. );
  679. }
  680. public function test_tokenizeHTML_bodyInCDATA()
  681. {
  682. $alt_tokens = array(
  683. new HTMLPurifier_Token_Text('<'),
  684. new HTMLPurifier_Token_Text('body'),
  685. new HTMLPurifier_Token_Text('>'),
  686. new HTMLPurifier_Token_Text('Foo'),
  687. new HTMLPurifier_Token_Text('<'),
  688. new HTMLPurifier_Token_Text('/body'),
  689. new HTMLPurifier_Token_Text('>'),
  690. );
  691. $this->assertTokenization(
  692. '<![CDATA[<body>Foo</body>]]>',
  693. array(
  694. new HTMLPurifier_Token_Text('<body>Foo</body>'),
  695. ),
  696. array(
  697. 'PH5P' => $alt_tokens,
  698. )
  699. );
  700. }
  701. public function test_tokenizeHTML_()
  702. {
  703. $this->assertTokenization(
  704. '<a><img /></a>',
  705. array(
  706. new HTMLPurifier_Token_Start('a'),
  707. new HTMLPurifier_Token_Empty('img'),
  708. new HTMLPurifier_Token_End('a'),
  709. )
  710. );
  711. }
  712. public function test_tokenizeHTML_ignoreIECondComment()
  713. {
  714. $this->assertTokenization(
  715. '<!--[if IE]>foo<a>bar<!-- baz --><![endif]-->',
  716. array()
  717. );
  718. }
  719. public function test_tokenizeHTML_removeProcessingInstruction()
  720. {
  721. $this->config->set('Core.RemoveProcessingInstructions', true);
  722. $this->assertTokenization(
  723. '<?xml blah blah ?>',
  724. array()
  725. );
  726. }
  727. public function test_tokenizeHTML_removeNewline()
  728. {
  729. $this->config->set('Core.NormalizeNewlines', true);
  730. $this->assertTokenization(
  731. "plain\rtext\r\n",
  732. array(
  733. new HTMLPurifier_Token_Text("plain\ntext\n")
  734. )
  735. );
  736. }
  737. public function test_tokenizeHTML_noRemoveNewline()
  738. {
  739. $this->config->set('Core.NormalizeNewlines', false);
  740. $this->assertTokenization(
  741. "plain\rtext\r\n",
  742. array(
  743. new HTMLPurifier_Token_Text("plain\rtext\r\n")
  744. )
  745. );
  746. }
  747. public function test_tokenizeHTML_conditionalCommentUngreedy()
  748. {
  749. $this->assertTokenization(
  750. '<!--[if gte mso 9]>a<![endif]-->b<!--[if gte mso 9]>c<![endif]-->',
  751. array(
  752. new HTMLPurifier_Token_Text("b")
  753. )
  754. );
  755. }
  756. public function test_tokenizeHTML_imgTag()
  757. {
  758. $start = array(
  759. new HTMLPurifier_Token_Start('img',
  760. array(
  761. 'src' => 'img_11775.jpg',
  762. 'alt' => '[Img #11775]',
  763. 'id' => 'EMBEDDED_IMG_11775',
  764. )
  765. )
  766. );
  767. $this->assertTokenization(
  768. '<img src="img_11775.jpg" alt="[Img #11775]" id="EMBEDDED_IMG_11775" >',
  769. array(
  770. new HTMLPurifier_Token_Empty('img',
  771. array(
  772. 'src' => 'img_11775.jpg',
  773. 'alt' => '[Img #11775]',
  774. 'id' => 'EMBEDDED_IMG_11775',
  775. )
  776. )
  777. ),
  778. array(
  779. 'DirectLex' => $start,
  780. )
  781. );
  782. }
  783. public function test_tokenizeHTML_prematureDivClose()
  784. {
  785. $this->assertTokenization(
  786. '</div>dont<b>die</b>',
  787. array(
  788. new HTMLPurifier_Token_End('div'),
  789. new HTMLPurifier_Token_Text('dont'),
  790. new HTMLPurifier_Token_Start('b'),
  791. new HTMLPurifier_Token_Text('die'),
  792. new HTMLPurifier_Token_End('b'),
  793. ),
  794. array(
  795. 'DOMLex' => $alt = array(
  796. new HTMLPurifier_Token_Text('dont'),
  797. new HTMLPurifier_Token_Start('b'),
  798. new HTMLPurifier_Token_Text('die'),
  799. new HTMLPurifier_Token_End('b')
  800. ),
  801. 'PH5P' => $alt
  802. )
  803. );
  804. }
  805. /*
  806. public function test_tokenizeHTML_()
  807. {
  808. $this->assertTokenization(
  809. ,
  810. array(
  811. )
  812. );
  813. }
  814. */
  815. }
// vim: et sw=4 sts=4