Skip to content
Snippets Groups Projects
Commit ed8fbfda authored by megachriz's avatar megachriz Committed by MegaChriz
Browse files

Issue #1428272 by OnkelTem, eosrei, jtsnow, Jerenus, liquidcms, acouch,...

Issue #1428272 by OnkelTem, eosrei, jtsnow, Jerenus, liquidcms, acouch, derhasi, Niremizov, MegaChriz: Added support of encoding conversions to the CSV Parser.
parent b9a7eda0
No related branches found
No related tags found
No related merge requests found
......@@ -209,6 +209,12 @@ Default: FALSE
Description: Flag to stop feeds from using its cURL for http requests. See
http_request_use_curl().
Name: feeds_use_mbstring
Default: TRUE
Description: The extension mbstring is used to convert encodings during parsing.
The reason that this can be turned off is to be able to test Feeds
behavior when the extension is not available.
Glossary
========
......
......@@ -50,6 +50,7 @@ files[] = tests/feeds_fetcher_http.test
files[] = tests/feeds_i18n.test
files[] = tests/feeds_i18n_node.test
files[] = tests/feeds_i18n_taxonomy.test
files[] = tests/feeds_parser_csv.test
files[] = tests/feeds_parser_sitemap.test
files[] = tests/feeds_parser_syndication.test
files[] = tests/feeds_processor_node.test
......
......@@ -60,6 +60,7 @@ function feeds_requirements($phase) {
function feeds_uninstall() {
variable_del('http_request_timeout');
variable_del('feeds_reschedule');
variable_del('feeds_use_mbstring');
}
/**
......
......@@ -66,6 +66,8 @@ class ParserCSVIterator implements Iterator {
*/
class ParserCSV {
private $delimiter;
private $fromEncoding;
private $toEncoding;
private $skipFirstLine;
private $columnNames;
private $timeout;
......@@ -73,9 +75,12 @@ class ParserCSV {
private $startByte;
private $lineLimit;
private $lastLinePos;
private $useMbString;
public function __construct() {
$this->delimiter = ',';
$this->fromEncoding = 'UTF-8';
$this->toEncoding = 'UTF-8';
$this->skipFirstLine = FALSE;
$this->columnNames = FALSE;
$this->timeout = FALSE;
......@@ -84,6 +89,9 @@ class ParserCSV {
$this->lineLimit = 0;
$this->lastLinePos = 0;
ini_set('auto_detect_line_endings', TRUE);
if (extension_loaded('mbstring') && variable_get('feeds_use_mbstring', TRUE)) {
$this->useMbString = TRUE;
}
}
/**
......@@ -94,6 +102,18 @@ class ParserCSV {
$this->delimiter = $delimiter;
}
/**
* Sets the source file encoding.
*
* By default, the encoding is UTF-8.
*
* @param string $encoding
* The encoding to set.
*/
public function setEncoding($encoding) {
$this->fromEncoding = $encoding;
}
/**
* Set this to TRUE if the parser should skip the first line of the CSV text,
* which might be desired if the first line contains the column names.
......@@ -197,7 +217,7 @@ class ParserCSV {
for ($lineIterator->rewind($this->startByte); $lineIterator->valid(); $lineIterator->next()) {
// Make really sure we've got lines without trailing newlines.
$line = trim($lineIterator->current(), "\r\n");
$line = trim($this->fixEncoding($lineIterator->current()), "\r\n");
// Skip empty lines.
if (empty($line)) {
......@@ -237,7 +257,7 @@ class ParserCSV {
}
// Ok, so, on with fetching the next line, as mentioned above.
$currentField .= "\n";
$line = trim($lineIterator->current(), "\r\n");
$line = trim($this->fixEncoding($lineIterator->current()), "\r\n");
$currentIndex = 0;
continue;
}
......@@ -325,4 +345,38 @@ class ParserCSV {
}
return $rows;
}
/**
* Converts encoding of input data.
*
* @param string $data
* A chunk of data.
*
* @return string
* The encoded data.
*
* @throws ParserCSVEncodingException
* Thrown when a given encoding does not match.
*/
public function fixEncoding($data) {
if ($this->useMbString) {
if (mb_check_encoding($data, $this->fromEncoding)) {
if ($this->toEncoding != $this->fromEncoding) {
// Convert encoding. The conversion is to UTF-8 by default to prevent
// SQL errors.
$data = mb_convert_encoding($data, $this->toEncoding, $this->fromEncoding);
}
}
else {
throw new ParserCSVEncodingException(t('Source file is not in %encoding encoding.', array('%encoding' => $this->fromEncoding)));
}
}
return $data;
}
}
/**
* Exception thrown when an encoding error occurs during parsing.
*/
class ParserCSVEncodingException extends Exception {}
......@@ -22,6 +22,10 @@ class FeedsCSVParser extends FeedsParser {
$parser = new ParserCSV();
$delimiter = $this->getDelimiterChar($source_config);
$parser->setDelimiter($delimiter);
if (isset($source_config['encoding'])) {
// Encoding can only be set when the mbstring extension is loaded.
$parser->setEncoding($source_config['encoding']);
}
$iterator = new ParserCSVIterator($fetcher_result->getFilePath());
if (empty($source_config['no_headers'])) {
......@@ -114,6 +118,7 @@ class FeedsCSVParser extends FeedsParser {
public function sourceDefaults() {
return array(
'delimiter' => $this->config['delimiter'],
'encoding' => $this->config['encoding'],
'no_headers' => $this->config['no_headers'],
);
}
......@@ -172,6 +177,10 @@ class FeedsCSVParser extends FeedsParser {
'#description' => t('Check if the imported CSV file does not start with a header row. If checked, mapping sources must be named \'0\', \'1\', \'2\' etc.'),
'#default_value' => isset($source_config['no_headers']) ? $source_config['no_headers'] : 0,
);
$form['encoding'] = $this->configEncodingForm();
if (isset($source_config['encoding'])) {
$form['encoding']['#default_value'] = $source_config['encoding'];
}
return $form;
}
......@@ -181,6 +190,7 @@ class FeedsCSVParser extends FeedsParser {
public function configDefaults() {
return array(
'delimiter' => ',',
'encoding' => 'UTF-8',
'no_headers' => 0,
);
}
......@@ -203,9 +213,42 @@ class FeedsCSVParser extends FeedsParser {
'#description' => t('Check if the imported CSV file does not start with a header row. If checked, mapping sources must be named \'0\', \'1\', \'2\' etc.'),
'#default_value' => $this->config['no_headers'],
);
$form['encoding'] = $this->configEncodingForm();
return $form;
}
/**
* Builds configuration field for setting file encoding.
*
* If the mbstring extension is not available a markup render array
* will be returned instead.
*
* @return array
* A renderable array.
*/
public function configEncodingForm() {
if (extension_loaded('mbstring') && variable_get('feeds_use_mbstring', TRUE)) {
// Get the system's list of available encodings.
$options = mb_list_encodings();
// Make the key/values the same in the array.
$options = array_combine($options, $options);
// Sort alphabetically not-case sensitive.
natcasesort($options);
return array(
'#type' => 'select',
'#title' => t('File encoding'),
'#description' => t('Performs character encoding conversion from selected option to UTF-8.'),
'#options' => $options,
'#default_value' => $this->config['encoding'],
);
}
else {
return array(
'#markup' => '<em>' . t('PHP mbstring extension must be available for character encoding conversion.') . '</em>',
);
}
}
public function getTemplate() {
$mappings = feeds_importer($this->id)->processor->config['mappings'];
$sources = $uniques = array();
......
<?php
/**
* @file
* Result of encoding_{code}.csv file parsed by ParserCSV.inc
*/
// JSON is used here because it supports unicode literals. PHP does not.
$json = <<<EOT
[
[
"id",
"text"
],
[
"1",
"\u672c\u65e5\u306f\u3044\u3044\u5929\u6c17"
],
[
"2",
"\uff71\uff72\uff73\uff74\uff75"
],
[
"3",
"\u30c6\u30b9\u30c8"
],
[
"4",
"\u2605"
]
]
EOT;
$control_result = json_decode($json);
id,text
1,本日はいい天気
2,アイウエオ
3,テスト
4,★
id,text
1,本日はいい天気
2,アイウエオ
3,テスト
4,★
id,text
1,本日はいい天気
2,アイウエオ
3,テスト
4,★
<?php
/**
* @file
* Contains FeedsCSVParserTestCase.
*/
/**
* Tests the CSV parser using the UI.
*/
class FeedsCSVParserTestCase extends FeedsWebTestCase {
public static function getInfo() {
return array(
'name' => 'CSV parser functional tests',
'description' => 'Tests the CSV parser using the UI.',
'group' => 'Feeds',
);
}
/**
* Tests parsing a CSV when the mbstring extension is not available.
*/
public function testMbstringExtensionDisabled() {
// Set "feeds_use_mbstring" to FALSE to emulate that the mbstring extension
// is not loaded.
variable_set('feeds_use_mbstring', FALSE);
// Remove items after parsing because in < PHP 5.4 processing items with
// encoding issues leads to test failures because check_plain() can only
// handle UTF-8 encoded strings.
// @see feeds_tests_feeds_after_parse()
variable_set('feeds_tests_feeds_after_parse_empty_items', TRUE);
// Create node type.
$node_type = $this->drupalCreateContentType();
// Create and configure importer.
$this->createImporterConfiguration('Content CSV', 'csv');
$this->setPlugin('csv', 'FeedsFileFetcher');
$this->setPlugin('csv', 'FeedsCSVParser');
$this->setSettings('csv', 'FeedsNodeProcessor', array('bundle' => $node_type->type));
$this->addMappings('csv', array(
0 => array(
'source' => 'id',
'target' => 'guid',
),
1 => array(
'source' => 'text',
'target' => 'title',
),
));
// Ensure that on the CSV parser settings page a message is shown about that
// the mbstring extension is not available.
$this->drupalGet('admin/structure/feeds/csv/settings/FeedsCSVParser');
$this->assertNoField('encoding');
$this->assertText('PHP mbstring extension must be available for character encoding conversion.');
// Try to import a CSV file that is not UTF-8 encoded. No encoding warning
// should be shown, but import should fail.
$this->importFile('csv', $this->absolutePath() . '/tests/feeds/encoding_SJIS.csv');
$this->assertNoText('Source file is not in UTF-8 encoding.');
}
/**
* Tests an encoding failure during parsing a CSV.
*/
public function testEncodingFailure() {
// Create node type.
$node_type = $this->drupalCreateContentType();
// Create and configure importer.
$this->createImporterConfiguration('Content CSV', 'csv');
$this->setPlugin('csv', 'FeedsFileFetcher');
$this->setPlugin('csv', 'FeedsCSVParser');
$this->setSettings('csv', 'FeedsNodeProcessor', array('bundle' => $node_type->type));
$this->addMappings('csv', array(
0 => array(
'source' => 'id',
'target' => 'guid',
),
1 => array(
'source' => 'text',
'target' => 'title',
),
));
// Ensure that on the CSV parser settings page a setting for encoding is
// shown.
$this->drupalGet('admin/structure/feeds/csv/settings/FeedsCSVParser');
$this->assertField('encoding');
$this->assertNoText('PHP mbstring extension must be available for character encoding conversion.');
// Try to import a CSV file that is not UTF-8 encoded. Import should be
// halted and an encoding warning should be shown.
$this->importFile('csv', $this->absolutePath() . '/tests/feeds/encoding_SJIS.csv');
$this->assertNoText('Failed importing 4 nodes.');
$this->assertText('Source file is not in UTF-8 encoding.');
}
}
......@@ -350,6 +350,24 @@ function feeds_tests_mapper_unique(FeedsSource $source, $entity_type, $bundle, $
}
}
/**
* Implements hook_feeds_after_parse().
*
* Empties the list of items to import in case the test says that there are
* items in there with encoding issues. These items can not be processed during
* tests without having a test failure because in < PHP 5.4 that would produce
* the following warning:
* htmlspecialchars(): Invalid multibyte sequence in argument
*
* @see FeedsCSVParserTestCase::testMbstringExtensionDisabled()
*/
function feeds_tests_feeds_after_parse(FeedsSource $source, FeedsParserResult $result) {
if (variable_get('feeds_tests_feeds_after_parse_empty_items', FALSE)) {
// Remove all items. No items will be processed.
$result->items = array();
}
}
/**
* Helper class to ensure callbacks can be objects.
*/
......
......@@ -32,6 +32,8 @@ class ParserCSVTest extends DrupalWebTestCase {
$this->_testSimple();
$this->_testBatching();
$this->_testEncodingConversion();
$this->_testEncodingConversionFailure();
}
/**
......@@ -53,6 +55,51 @@ class ParserCSVTest extends DrupalWebTestCase {
}
}
/**
* Simple test of encoding conversion prior to parsing.
*/
protected function _testEncodingConversion() {
// Pull in the $control_result array.
include $this->absolutePath() . '/tests/feeds/encoding.csv.php';
$encodings = $this->getEncodings();
foreach ($encodings as $encoding) {
$file = $this->absolutePath() . "/tests/feeds/encoding_{$encoding}.csv";
$iterator = new ParserCSVIterator($file);
$parser = new ParserCSV();
$parser->setDelimiter(',');
$parser->setEncoding($encoding);
$rows = $parser->parse($iterator);
$this->assertFalse($parser->lastLinePos(), format_string('CSV reports all lines parsed, with encoding: %encoding', array('%encoding' => $encoding)));
$this->assertEqual(md5(serialize($rows)), md5(serialize($control_result)), 'Converted and parsed result matches control result.');
}
}
/**
* Simple test of failed encoding conversion prior to parsing.
*/
protected function _testEncodingConversionFailure() {
// Pull in the $control_result array.
include $this->absolutePath() . '/tests/feeds/encoding.csv.php';
$encodings = $this->getEncodings();
foreach ($encodings as $encoding) {
$file = $this->absolutePath() . "/tests/feeds/encoding_{$encoding}.csv";
$iterator = new ParserCSVIterator($file);
$parser = new ParserCSV();
$parser->setDelimiter(',');
// Attempt to read file as UTF-8.
$parser->setEncoding('UTF-8');
try {
$rows = $parser->parse($iterator);
$this->fail('Incorrect conversion attempt throws exception.');
}
catch (ParserCSVEncodingException $e) {
$this->assertNotNull($e->getMessage(), 'Incorrect conversion attempt throws exception.');
}
}
}
/**
* Test batching.
*/
......@@ -100,4 +147,11 @@ class ParserCSVTest extends DrupalWebTestCase {
'tab' => "\t",
);
}
static function getEncodings() {
return array(
'SJIS-win',
'SJIS',
);
}
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment