You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by ma...@apache.org on 2021/03/02 21:38:50 UTC
[incubator-pinot] branch master updated: Allow escaping comma characters in CSV files. (#6627)
This is an automated email from the ASF dual-hosted git repository.
mayanks pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 3cef97a Allow escaping comma characters in CSV files. (#6627)
3cef97a is described below
commit 3cef97a7743550077c33f27ceae140d445adf34b
Author: Amrish Lal <am...@gmail.com>
AuthorDate: Tue Mar 2 13:38:38 2021 -0800
Allow escaping comma characters in CSV files. (#6627)
* Allow escaping comma characters in CSV files.
* Allow escaping comma characters in CSV files.
* Set backslash as default escape character for parsing CSV files.
* Setting default escape character to null to avoid test case failures.
---
.../plugin/inputformat/csv/CSVRecordReader.java | 1 +
.../inputformat/csv/CSVRecordReaderConfig.java | 12 ++++++--
.../inputformat/csv/CSVRecordExtractorTest.java | 36 ++++++++++++++++++++++
3 files changed, 47 insertions(+), 2 deletions(-)
diff --git a/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java b/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java
index 48c9b00..fc14272 100644
--- a/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java
+++ b/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java
@@ -89,6 +89,7 @@ public class CSVRecordReader implements RecordReader {
}
Character commentMarker = config.getCommentMarker();
format = format.withCommentMarker(commentMarker);
+ format = format.withEscape(config.getEscapeCharacter());
_format = format;
multiValueDelimiter = config.getMultiValueDelimiter();
}
diff --git a/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReaderConfig.java b/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReaderConfig.java
index b30c09f..448de85 100644
--- a/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReaderConfig.java
+++ b/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReaderConfig.java
@@ -32,8 +32,8 @@ public class CSVRecordReaderConfig implements RecordReaderConfig {
private String _header;
private char _delimiter = DEFAULT_DELIMITER;
private char _multiValueDelimiter = DEFAULT_MULTI_VALUE_DELIMITER;
-
- private Character _commentMarker; // Default is null
+ private Character _commentMarker; // Default is null
+ private Character _escapeCharacter; // Default is null
public String getFileFormat() {
return _fileFormat;
@@ -75,6 +75,14 @@ public class CSVRecordReaderConfig implements RecordReaderConfig {
_commentMarker = commentMarker;
}
+ public Character getEscapeCharacter() {
+ return _escapeCharacter;
+ }
+
+ public void setEscapeCharacter(Character escapeCharacter) {
+ _escapeCharacter = escapeCharacter;
+ }
+
@Override
public String toString() {
return ToStringBuilder.reflectionToString(this, ToStringStyle.SHORT_PREFIX_STYLE);
diff --git a/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordExtractorTest.java b/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordExtractorTest.java
index 7432d5b..93994c9 100644
--- a/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordExtractorTest.java
+++ b/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordExtractorTest.java
@@ -18,10 +18,12 @@
*/
package org.apache.pinot.plugin.inputformat.csv;
+import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Collection;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -32,6 +34,7 @@ import org.apache.pinot.spi.data.readers.AbstractRecordExtractorTest;
import org.apache.pinot.spi.data.readers.GenericRow;
import org.apache.pinot.spi.data.readers.RecordReader;
import org.testng.Assert;
+import org.testng.annotations.Test;
/**
@@ -101,4 +104,37 @@ public class CSVRecordExtractorTest extends AbstractRecordExtractorTest {
}
}
}
+
+ /**
+ * Check if we can parse a CSV file that has escaped comma characters within fields.
+ */
+ @Test
+ public void testEscapeCharacterInCSV()
+ throws Exception {
+ // Create CSV config with backslash as escape character.
+ CSVRecordReaderConfig csvRecordReaderConfig = new CSVRecordReaderConfig();
+ csvRecordReaderConfig.setEscapeCharacter('\\');
+
+ // Create a CSV file where records have two values and the second value contains an escaped comma.
+ File escapedFile = new File(_tempDir, "escape.csv");
+ BufferedWriter writer = new BufferedWriter(new FileWriter(escapedFile));
+ writer.write("first,second\n");
+ writer.write("string1, string2\\, string3");
+ writer.close();
+
+ // Try to parse CSV file with escaped comma.
+ CSVRecordReader csvRecordReader = new CSVRecordReader();
+ HashSet<String> fieldsToRead = new HashSet<>();
+ fieldsToRead.add("first");
+ fieldsToRead.add("second");
+ csvRecordReader.init(escapedFile, fieldsToRead, csvRecordReaderConfig);
+ GenericRow genericRow = new GenericRow();
+ csvRecordReader.rewind();
+
+ // check if parsing succeeded.
+ Assert.assertTrue(csvRecordReader.hasNext());
+ csvRecordReader.next(genericRow);
+ Assert.assertEquals(genericRow.getValue("first"), "string1");
+ Assert.assertEquals(genericRow.getValue("second"), " string2, string3");
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org