ImportMediawiki.php 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604
  1. <?php
  2. /* vim: set expandtab sw=4 ts=4 sts=4: */
  3. /**
  4. * MediaWiki import plugin for phpMyAdmin
  5. *
  6. * @package PhpMyAdmin-Import
  7. * @subpackage MediaWiki
  8. */
  9. declare(strict_types=1);
  10. namespace PhpMyAdmin\Plugins\Import;
  11. use PhpMyAdmin\Import;
  12. use PhpMyAdmin\Message;
  13. use PhpMyAdmin\Plugins\ImportPlugin;
  14. use PhpMyAdmin\Properties\Plugins\ImportPluginProperties;
  15. /**
  16. * Handles the import for the MediaWiki format
  17. *
  18. * @package PhpMyAdmin-Import
  19. * @subpackage MediaWiki
  20. */
  21. class ImportMediawiki extends ImportPlugin
  22. {
  23. /**
  24. * Whether to analyze tables
  25. *
  26. * @var bool
  27. */
  28. private $_analyze;
  /**
   * Builds the plugin instance.
   *
   * Runs the parent constructor first and then registers the plugin
   * properties through setProperties().
   */
  public function __construct()
  {
      parent::__construct();

      $this->setProperties();
  }
  /**
   * Configures the plugin: decides whether imported tables get analyzed and
   * fills in the descriptive import properties.
   * Invoked from the constructor.
   *
   * @return void
   */
  protected function setProperties()
  {
      // Analysis is switched on unless the 'plugin_param' global is 'table'
      $this->_setAnalyze($GLOBALS['plugin_param'] !== 'table');

      $properties = new ImportPluginProperties();
      $properties->setText(__('MediaWiki Table'));
      $properties->setExtension('txt');
      $properties->setMimeType('text/plain');
      $properties->setOptions([]);
      $properties->setOptionsText(__('Options'));

      $this->properties = $properties;
  }
  /**
   * Handles the whole import logic
   *
   * Reads the import chunk by chunk through $this->import->getNextChunk(),
   * splits the data on "\n" and interprets every trimmed, non-empty line
   * as MediaWiki markup:
   *  - "<!--" opens a comment and "-->" closes it; inside a comment the
   *    lines "Table data for `x`" / "Table structure for `x`" are matched;
   *  - "{|" starts a table;
   *  - "|-", "|+" and "|}" end the current row, "|}" also ends the table
   *    and hands it to _importDataOneTable();
   *  - lines starting with "|" or "!" hold data / header cells.
   * Any other line sets the global $error flag.
   *
   * @param array $sql_data 2-element array with sql data
   *
   * @return void
   */
  public function doImport(array &$sql_data = [])
  {
      global $error, $timeout_passed, $finished;

      // Defaults for parser

      // The buffer that will be used to store chunks read from the imported file
      $buffer = '';

      // Used as storage for the last part of the current chunk data
      // Will be prepended to the first line of the next chunk, if there is one
      $last_chunk_line = '';

      // Remembers whether the current buffer line is part of a comment
      $inside_comment = false;

      // Remembers whether the current buffer line is part of a data comment
      $inside_data_comment = false;

      // Remembers whether the current buffer line is part of a structure comment
      $inside_structure_comment = false;

      // MediaWiki only accepts "\n" as row terminator
      $mediawiki_new_line = "\n";

      // Initialize the name of the current table
      $cur_table_name = "";

      // Table parsing state. It is initialised up front so that a row or
      // table-end marker met before any "{|" line, or a table without a
      // header row, never reads an undefined variable.
      $cur_temp_table = [];
      $cur_temp_table_headers = [];
      $cur_temp_line = [];
      $in_table_header = false;

      while (! $finished && ! $error && ! $timeout_passed) {
          $data = $this->import->getNextChunk();

          if ($data === false) {
              // Subtract data we didn't handle yet and stop processing
              $GLOBALS['offset'] -= mb_strlen($buffer);
              break;
          } elseif ($data !== true) {
              // Take the freshly read chunk as the current buffer
              $buffer = $data;
              unset($data);

              // Don't parse the chunk if we're not at the end and it has no
              // new line inside. Keep it as part of the pending partial
              // line instead of dropping it.
              if (! $finished
                  && mb_strpos($buffer, $mediawiki_new_line) === false
              ) {
                  $last_chunk_line .= $buffer;
                  continue;
              }
          } else {
              // No new data was delivered: only the pending partial line is
              // left, so the already parsed buffer must not be parsed again
              $buffer = '';
          }

          // Because of reading chunk by chunk, the first line from the buffer
          // contains only a portion of an actual line from the imported file.
          // Therefore, we have to append it to the last line from the previous
          // chunk. If we are at the first chunk, $last_chunk_line should be empty.
          $buffer = $last_chunk_line . $buffer;

          // Process the buffer line by line
          $buffer_lines = explode($mediawiki_new_line, $buffer);

          $full_buffer_lines_count = count($buffer_lines);
          // If the reading is not finalised, the final line of the current chunk
          // will not be complete
          if (! $finished) {
              $last_chunk_line = $buffer_lines[--$full_buffer_lines_count];
          }

          for ($line_nr = 0; $line_nr < $full_buffer_lines_count; ++$line_nr) {
              $cur_buffer_line = trim($buffer_lines[$line_nr]);

              // If the line is empty, go to the next one
              if ($cur_buffer_line === '') {
                  continue;
              }

              $first_character = $cur_buffer_line[0];
              $row_marker = mb_substr($cur_buffer_line, 0, 2);
              $matches = [];

              // Check beginning of comment
              if (mb_substr($cur_buffer_line, 0, 4) === "<!--") {
                  $inside_comment = true;
                  continue;
              } elseif ($inside_comment) {
                  // Check end of comment. The terminator is three characters
                  // long, so exactly three characters are compared; this way
                  // a line such as "--> trailing text" closes the comment too.
                  if (mb_substr($cur_buffer_line, 0, 3) === "-->") {
                      // Only data comments are closed. The structure comments
                      // will be closed when a data comment begins (in order to
                      // skip structure tables)
                      if ($inside_data_comment) {
                          $inside_data_comment = false;
                      }

                      // End comments that are not related to table structure
                      if (! $inside_structure_comment) {
                          $inside_comment = false;
                      }
                  } else {
                      // Check table name
                      $match_table_name = [];
                      if (preg_match(
                          "/^Table data for `(.*)`$/",
                          $cur_buffer_line,
                          $match_table_name
                      )
                      ) {
                          $cur_table_name = $match_table_name[1];
                          $inside_data_comment = true;

                          $inside_structure_comment
                              = $this->_mngInsideStructComm(
                                  $inside_structure_comment
                              );
                      } elseif (preg_match(
                          "/^Table structure for `(.*)`$/",
                          $cur_buffer_line,
                          $match_table_name
                      )
                      ) {
                          // The structure comments will be ignored
                          $inside_structure_comment = true;
                      }
                  }
                  continue;
              } elseif (preg_match('/^\{\|(.*)$/', $cur_buffer_line, $matches)) {
                  // Check start of table

                  // This will store all the column info on all rows from
                  // the current table read from the buffer
                  $cur_temp_table = [];

                  // Headers of a previous table must not leak into this one
                  $cur_temp_table_headers = [];

                  // Will be used as storage for the current row in the buffer
                  // Once all its columns are read, it will be added to
                  // $cur_temp_table and then it will be emptied
                  $cur_temp_line = [];

                  // Helps us differentiate the header columns
                  // from the normal columns
                  $in_table_header = false;
                  // End processing because the current line does not
                  // contain any column information
              } elseif ($row_marker === '|-'
                  || $row_marker === '|+'
                  || $row_marker === '|}'
              ) {
                  // Check begin row or end table

                  // Add current line to the values storage
                  if (! empty($cur_temp_line)) {
                      // If the current line contains header cells
                      // ( marked with '!' ),
                      // it will be marked as table header
                      if ($in_table_header) {
                          // Set the header columns
                          $cur_temp_table_headers = $cur_temp_line;
                      } else {
                          // Normal line, add it to the table
                          $cur_temp_table[] = $cur_temp_line;
                      }
                  }

                  // Empty the temporary buffer
                  $cur_temp_line = [];

                  // No more processing required at the end of the table
                  if ($row_marker === '|}') {
                      $current_table = [
                          $cur_table_name,
                          $cur_temp_table_headers,
                          $cur_temp_table,
                      ];

                      // Import the current table data into the database
                      $this->_importDataOneTable($current_table, $sql_data);

                      // Reset table name
                      $cur_table_name = "";
                  }
                  // What's after the row tag is now only attributes
              } elseif (($first_character === '|') || ($first_character === '!')) {
                  // Check cell elements

                  // Header cells
                  if ($first_character === '!') {
                      // Mark as table header, but treat as normal row
                      $cur_buffer_line = str_replace('!!', '||', $cur_buffer_line);
                      // Will be used to set $cur_temp_line as table header
                      $in_table_header = true;
                  } else {
                      $in_table_header = false;
                  }

                  // Loop through each table cell
                  $cells = $this->_explodeMarkup($cur_buffer_line);
                  foreach ($cells as $cell) {
                      $cell = $this->_getCellData($cell);

                      // Delete the beginning of the column, if there is one
                      $cell = trim($cell);
                      $col_start_chars = [
                          "|",
                          "!",
                      ];
                      foreach ($col_start_chars as $col_start_char) {
                          $cell = $this->_getCellContent($cell, $col_start_char);
                      }

                      // Add the cell to the row
                      $cur_temp_line[] = $cell;
                  } // foreach $cells
              } else {
                  // If it's none of the above, then the current line has a bad
                  // format
                  // NOTE(review): $message is a local variable here and is not
                  // read again in this method - confirm whether it was meant
                  // to be published (e.g. through a global).
                  $message = Message::error(
                      __('Invalid format of mediawiki input on line: <br>%s.')
                  );
                  $message->addParam($cur_buffer_line);
                  $error = true;
              }
          } // End treating full buffer lines
      } // while - finished parsing buffer
  }
  /**
   * Imports data from a single table
   *
   * When analysis is enabled, the table gets a name and headers if it has
   * none, its columns are analyzed and the resulting SQL is built and
   * executed. Pending queries are flushed in every case.
   *
   * @param array $table    containing all table info:
   *                        <code>
   *                        $table[0] - string containing table name
   *                        $table[1] - array[] of table headers
   *                        $table[2] - array[][] of table content rows
   *                        </code>
   * @param array $sql_data 2-element array with sql data
   *
   * @return void
   */
  private function _importDataOneTable(array $table, array &$sql_data)
  {
      if ($this->_getAnalyze()) {
          // Missing headers or rows are treated as empty lists so that the
          // array-typed helpers below do not receive null
          $table[1] = $table[1] ?? [];
          $table[2] = $table[2] ?? [];

          // Set the table name
          $this->_setTableName($table[0]);

          // Set generic names for table headers if they don't exist.
          // A table without data rows has no first row; an empty row is
          // used instead of reading the undefined index $table[2][0].
          $first_row = $table[2][0] ?? [];
          $this->_setTableHeaders($table[1], $first_row);

          // Create the tables array to be used in Import::buildSql()
          $tables = [];
          $tables[] = [
              $table[0],
              $table[1],
              $table[2],
          ];

          // Obtain the best-fit MySQL types for each column
          $analyses = [];
          $analyses[] = $this->import->analyzeTable($tables[0]);

          $this->_executeImportTables($tables, $analyses, $sql_data);
      }

      // Commit any possible data in buffers
      $this->import->runQuery('', '', $sql_data);
  }
  /**
   * Gives the table a generated name when it has none
   *
   * A non-empty name is left untouched. Otherwise the name becomes
   * 'TABLE n', where n is one more than the number of rows returned by
   * 'SHOW TABLES'.
   *
   * @param string $table_name reference to the name of the table
   *
   * @return void
   */
  private function _setTableName(&$table_name)
  {
      if (! empty($table_name)) {
          return;
      }

      $existing_tables = $GLOBALS['dbi']->fetchResult('SHOW TABLES');

      // todo check if the name below already exists
      $table_name = 'TABLE ' . (count($existing_tables) + 1);
  }
  /**
   * Fills in generic header names (COL 1, COL 2, ...) when a table has
   * no headers of its own
   *
   * Existing headers are never modified. The number of generated names
   * equals the number of cells in the given row.
   *
   * @param array $table_headers reference to the array containing the headers
   *                             of a table
   * @param array $table_row     array containing the first content row
   *
   * @return void
   */
  private function _setTableHeaders(array &$table_headers, array $table_row)
  {
      if (! empty($table_headers)) {
          return;
      }

      // The first table row tells us how many columns there are
      $column_count = count($table_row);
      for ($position = 1; $position <= $column_count; ++$position) {
          $table_headers[$position - 1] = 'COL ' . $position;
      }
  }
  /**
   * Resolves the target database name and options, then delegates to
   * Import::buildSql()
   * Used in PMA_importDataAllTables() and $this->_importDataOneTable()
   *
   * @param array $tables   structure:
   *                        array(
   *                        array(table_name, array() column_names, array()()
   *                        rows)
   *                        )
   * @param array $analyses structure:
   *                        $analyses = array(
   *                        array(array() column_types, array() column_sizes)
   *                        )
   * @param array $sql_data 2-element array with sql data
   *
   * @global string $db name of the database to import in
   *
   * @return void
   */
  private function _executeImportTables(array &$tables, array &$analyses, array &$sql_data)
  {
      global $db;

      // First element : the currently selected database name, if applicable
      //                 (no backquotes)
      // Second element: an associative array of options
      list($target_db, $import_options) = $this->getDbnameAndOptions(
          $db,
          'mediawiki_DB'
      );

      // There are no additional SQL strings to pass along
      $additional_sql = null;

      // Create and execute necessary SQL statements from data
      $this->import->buildSql(
          $target_db,
          $tables,
          $analyses,
          $additional_sql,
          $import_options,
          $sql_data
      );
  }
  /**
   * Replaces all instances of the '||' separator that occur inside a
   * quoted tag attribute with the given placeholder
   *
   * The text is scanned byte by byte while tracking whether the scan is
   * inside a tag ('<' ... '>') and inside a quoted attribute of that tag.
   * Separators outside attributes are copied unchanged.
   *
   * @param string $replace the string to be replaced with
   * @param string $subject the text to be replaced
   *
   * @return string with replacements
   */
  private function _delimiterReplace($replace, $subject)
  {
      // String that will be returned
      $cleaned = "";
      // Possible states of current character
      $inside_tag = false;
      $inside_attribute = false;
      // Attributes can be declared with either " or '
      $start_attribute_character = false;

      // The full separator is "||";
      // This remembers if the previous character was '|'
      $partial_separator = false;

      // Parse text char by char
      for ($i = 0, $iMax = strlen($subject); $i < $iMax; $i++) {
          $cur_char = $subject[$i];

          // Check for separators
          if ($cur_char === '|') {
              if (! $inside_attribute) {
                  // Outside an attribute this is part of a real separator,
                  // so we append it to the current segment
                  $cleaned .= $cur_char;
                  if ($partial_separator) {
                      // A complete separator ends any open tag state
                      $inside_tag = false;
                      $inside_attribute = false;
                  }
              } elseif ($partial_separator) {
                  // Second '|' in a row inside an attribute: the pair is
                  // replaced with the placeholder
                  $cleaned .= $replace;
              }
              // If the previous character was also '|', then this ends a
              // full separator. If not, this may be the beginning of one
              $partial_separator = ! $partial_separator;

              continue;
          }

          // If we're inside a tag attribute and the current character is
          // not '|', but the previous one was, it means that the single '|'
          // was not appended, so we append it now
          if ($partial_separator && $inside_attribute) {
              $cleaned .= "|";
          }
          // If the char is different from "|", no separator can be formed
          $partial_separator = false;

          // any other character should be appended to the current segment
          $cleaned .= $cur_char;

          if ($cur_char === '<' && ! $inside_attribute) {
              // start of a tag
              $inside_tag = true;
          } elseif ($cur_char === '>' && ! $inside_attribute) {
              // end of a tag
              $inside_tag = false;
          } elseif (($cur_char === '"' || $cur_char === "'") && $inside_tag) {
              // start or end of an attribute
              if (! $inside_attribute) {
                  $inside_attribute = true;
                  // remember the attribute`s declaration character (" or ')
                  $start_attribute_character = $cur_char;
              } elseif ($cur_char === $start_attribute_character) {
                  $inside_attribute = false;
                  // unset attribute declaration character
                  $start_attribute_character = false;
              }
          }
      } // end for each character in $subject

      // A single '|' held back inside an attribute is normally appended when
      // the next character arrives; if the text ends first, append it here
      // so that it is not lost
      if ($partial_separator && $inside_attribute) {
          $cleaned .= "|";
      }

      return $cleaned;
  }
  /**
   * Splits a line into its cells, similarly to explode
   *
   * The '||' separator (standard in the mediawiki format) is used, but
   * occurrences that _delimiterReplace() masks as belonging to markup are
   * not treated as separators; they are restored in the returned items.
   * Used in parsing buffer lines containing data cells
   *
   * @param string $text text to be split
   *
   * @return array
   */
  private function _explodeMarkup($text)
  {
      $separator = "||";
      $placeholder = "\x00";

      // The placeholder must not occur in the input, so strip it first
      $without_placeholder = str_replace($placeholder, '', $text);

      // Mask the separators that sit inside HTML-like tags
      $masked = $this->_delimiterReplace($placeholder, $without_placeholder);

      // Split on the remaining separators, then unmask every item
      return array_map(
          static function ($item) use ($placeholder, $separator) {
              return str_replace($placeholder, $separator, $item);
          },
          explode($separator, $masked)
      );
  }
  460. /* ~~~~~~~~~~~~~~~~~~~~ Getters and Setters ~~~~~~~~~~~~~~~~~~~~ */
  /**
   * Tells whether imported tables should be analyzed
   *
   * @return bool the value last stored through _setAnalyze()
   */
  private function _getAnalyze()
  {
      return $this->_analyze;
  }
  /**
   * Stores whether imported tables should be analyzed
   *
   * @param bool $analyze true to analyze tables, false otherwise
   *
   * @return void
   */
  private function _setAnalyze($analyze)
  {
      $this->_analyze = $analyze;
  }
  /**
   * Extracts the data part of a cell
   *
   * A cell may be written as "parameters | data". The text after the
   * first '|' is returned, unless the text before that '|' contains '[['
   * - then the '|' is taken to belong to a link and the cell is returned
   * unchanged. A cell without any '|' is returned unchanged as well.
   *
   * @param string $cell Cell
   *
   * @return string
   */
  private function _getCellData($cell)
  {
      // A cell could contain both parameters and data
      $cell_data = explode('|', $cell, 2);

      // A '|' inside an invalid link should not
      // be mistaken as delimiting cell parameters
      if (mb_strpos($cell_data[0], '[[') !== false) {
          return $cell;
      }

      // No '|' at all: the whole cell is data
      if (count($cell_data) === 1) {
          return $cell_data[0];
      }

      // Drop the parameters and keep the data
      return $cell_data[1];
  }
  /**
   * Ends the "inside structure comment" state
   *
   * A truthy flag is turned into false; a falsy flag is handed back
   * unchanged.
   *
   * @param boolean $inside_structure_comment Value to test
   *
   * @return bool
   */
  private function _mngInsideStructComm($inside_structure_comment)
  {
      // End ignoring structure rows
      return $inside_structure_comment ? false : $inside_structure_comment;
  }
  /**
   * Strips a leading column start marker from a cell
   *
   * When the cell begins with the given marker, the first character is
   * removed and the remainder is trimmed; otherwise the cell is returned
   * as it is.
   *
   * @param string $cell           Cell
   * @param string $col_start_char Start char
   *
   * @return string
   */
  private function _getCellContent($cell, $col_start_char)
  {
      $starts_with_marker = mb_strpos($cell, $col_start_char) === 0;
      if (! $starts_with_marker) {
          return $cell;
      }

      return trim(mb_substr($cell, 1));
  }
  532. }