You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@carbondata.apache.org by gv...@apache.org on 2018/03/07 15:46:33 UTC
carbondata git commit: [CARBONDATA-2234] Support UTF-8 with BOM in CSVInputFormat
Repository: carbondata
Updated Branches:
refs/heads/master 9f2884a04 -> 910f26171
[CARBONDATA-2234] Support UTF-8 with BOM in CSVInputFormat
This closes #2038
Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/910f2617
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/910f2617
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/910f2617
Branch: refs/heads/master
Commit: 910f26171750276be5ccfe404be9d8ab0f2ead42
Parents: 9f2884a
Author: KanakaKumar <ka...@huawei.com>
Authored: Mon Mar 5 16:58:18 2018 +0530
Committer: Venkata Ramana G <ra...@huawei.com>
Committed: Wed Mar 7 21:16:16 2018 +0530
----------------------------------------------------------------------
integration/presto/pom.xml | 5 ++++
.../loading/csvinput/CSVInputFormat.java | 6 +++-
.../loading/csvinput/CSVInputFormatTest.java | 30 ++++++++++++++++++-
.../src/test/resources/csv/csv_with_bom.csv | 3 ++
.../src/test/resources/csv/csv_with_bom.csv.bz2 | Bin 0 -> 129 bytes
.../src/test/resources/csv/csv_with_bom.csv.gz | Bin 0 -> 110 bytes
6 files changed, 42 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/carbondata/blob/910f2617/integration/presto/pom.xml
----------------------------------------------------------------------
diff --git a/integration/presto/pom.xml b/integration/presto/pom.xml
index aaaf175..17f5d41 100644
--- a/integration/presto/pom.xml
+++ b/integration/presto/pom.xml
@@ -484,6 +484,11 @@
<artifactId>hk2-utils</artifactId>
<version>2.5.0-b42</version>
</dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>2.4</version>
+ </dependency>
</dependencies>
http://git-wip-us.apache.org/repos/asf/carbondata/blob/910f2617/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java
----------------------------------------------------------------------
diff --git a/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java b/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java
index 259b6da..aebaf3b 100644
--- a/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java
+++ b/processing/src/main/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormat.java
@@ -29,6 +29,7 @@ import org.apache.carbondata.core.util.CarbonProperties;
import com.univocity.parsers.csv.CsvParser;
import com.univocity.parsers.csv.CsvParserSettings;
+import org.apache.commons.io.input.BOMInputStream;
import org.apache.commons.lang.BooleanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
@@ -271,8 +272,11 @@ public class CSVInputFormat extends FileInputFormat<NullWritable, StringArrayWri
filePosition = fileIn;
inputStream = boundedInputStream;
}
- reader = new InputStreamReader(inputStream,
+
+ //Wrap input stream with BOMInputStream to skip UTF-8 BOM characters
+ reader = new InputStreamReader(new BOMInputStream(inputStream),
Charset.forName(CarbonCommonConstants.DEFAULT_CHARSET));
+
CsvParserSettings settings = extractCsvParserSettings(job);
if (start == 0) {
settings.setHeaderExtractionEnabled(job.getBoolean(HEADER_PRESENT,
http://git-wip-us.apache.org/repos/asf/carbondata/blob/910f2617/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java
----------------------------------------------------------------------
diff --git a/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java b/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java
index 14c680e..d89f10d 100644
--- a/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java
+++ b/processing/src/test/java/org/apache/carbondata/processing/loading/csvinput/CSVInputFormatTest.java
@@ -128,6 +128,7 @@ public class CSVInputFormatTest extends TestCase {
@Test public void testReadCSVFiles() throws Exception{
Configuration conf = new Configuration();
prepareConf(conf);
+ conf.setBoolean(CSVInputFormat.HEADER_PRESENT, true);
File output = new File("target/output_CSVInputFormatTest");
conf.set("mapreduce.cluster.local.dir", output.getCanonicalPath());
Job job = Job.getInstance(conf, "CSVInputFormat_normal");
@@ -149,8 +150,35 @@ public class CSVInputFormatTest extends TestCase {
Assert.assertTrue(job.waitForCompletion(true));
}
+ /**
+ * test read csv files encoded as UTF-8 with BOM
+ * @throws Exception
+ */
+ @Test public void testReadCSVFilesWithBOM() throws Exception{
+
+ Configuration conf = new Configuration();
+ prepareConf(conf);
+ conf.setBoolean(CSVInputFormat.HEADER_PRESENT, false);
+ File output = new File("target/output_CSVInputFormatTest_bom");
+ conf.set("mapreduce.cluster.local.dir", output.getCanonicalPath());
+ Job job = Job.getInstance(conf, "CSVInputFormat_normal_bom");
+ job.setJarByClass(CSVInputFormatTest.class);
+ job.setMapperClass(CSVCheckMapper.class);
+ job.setNumReduceTasks(0);
+ job.setInputFormatClass(CSVInputFormat.class);
+
+ String inputFolder = new File("src/test/resources/csv").getCanonicalPath();
+ FileInputFormat.addInputPath(job, new Path(inputFolder + File.separator + "csv_with_bom.csv"));
+ FileInputFormat.addInputPath(job, new Path(inputFolder + File.separator + "csv_with_bom.csv.bz2"));
+ FileInputFormat.addInputPath(job, new Path(inputFolder + File.separator + "csv_with_bom.csv.gz"));
+
+ deleteOutput(output);
+ FileOutputFormat.setOutputPath(job, new Path(output.getCanonicalPath()));
+
+ Assert.assertTrue(job.waitForCompletion(true));
+ }
+
private void prepareConf(Configuration conf) {
- conf.setBoolean(CSVInputFormat.HEADER_PRESENT, true);
conf.set(CSVInputFormat.MAX_COLUMNS, "10");
conf.set(CSVInputFormat.NUMBER_OF_COLUMNS, "7");
}
http://git-wip-us.apache.org/repos/asf/carbondata/blob/910f2617/processing/src/test/resources/csv/csv_with_bom.csv
----------------------------------------------------------------------
diff --git a/processing/src/test/resources/csv/csv_with_bom.csv b/processing/src/test/resources/csv/csv_with_bom.csv
new file mode 100644
index 0000000..ea4cfcc
--- /dev/null
+++ b/processing/src/test/resources/csv/csv_with_bom.csv
@@ -0,0 +1,3 @@
+1,2015/7/23,china,aaa1,phone197,ASD69643,15000
+2,2015/7/24,china,aaa2,phone756,ASD42892,15001
+3,2015/7/25,china,aaa3,phone1904,ASD37014,15002
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/carbondata/blob/910f2617/processing/src/test/resources/csv/csv_with_bom.csv.bz2
----------------------------------------------------------------------
diff --git a/processing/src/test/resources/csv/csv_with_bom.csv.bz2 b/processing/src/test/resources/csv/csv_with_bom.csv.bz2
new file mode 100644
index 0000000..21da5d5
Binary files /dev/null and b/processing/src/test/resources/csv/csv_with_bom.csv.bz2 differ
http://git-wip-us.apache.org/repos/asf/carbondata/blob/910f2617/processing/src/test/resources/csv/csv_with_bom.csv.gz
----------------------------------------------------------------------
diff --git a/processing/src/test/resources/csv/csv_with_bom.csv.gz b/processing/src/test/resources/csv/csv_with_bom.csv.gz
new file mode 100644
index 0000000..e3bd12e
Binary files /dev/null and b/processing/src/test/resources/csv/csv_with_bom.csv.gz differ