You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by si...@apache.org on 2022/07/07 22:01:21 UTC
[pinot] branch master updated: [8835] Fix for CSV files surrounding space (#9028)
This is an automated email from the ASF dual-hosted git repository.
siddteotia pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new c1dea5951f [8835] Fix for CSV files surrounding space (#9028)
c1dea5951f is described below
commit c1dea5951fd513d77b479fcd44c77be81f757ec9
Author: Ravishankar <ra...@gmail.com>
AuthorDate: Fri Jul 8 03:31:13 2022 +0530
[8835] Fix for CSV files surrounding space (#9028)
---
.../plugin/inputformat/csv/CSVRecordReader.java | 2 +-
.../inputformat/csv/CSVRecordExtractorTest.java | 53 +++++++++++++++++++++-
2 files changed, 53 insertions(+), 2 deletions(-)
diff --git a/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java b/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java
index 471dee9ea4..a95ee47036 100644
--- a/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java
+++ b/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java
@@ -80,7 +80,7 @@ public class CSVRecordReader implements RecordReader {
}
}
char delimiter = config.getDelimiter();
- format = format.withDelimiter(delimiter);
+ format = format.withDelimiter(delimiter).withIgnoreSurroundingSpaces(true);
String csvHeader = config.getHeader();
if (csvHeader == null) {
format = format.withHeader();
diff --git a/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordExtractorTest.java b/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordExtractorTest.java
index d23fbd0dd7..4dea94f398 100644
--- a/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordExtractorTest.java
+++ b/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordExtractorTest.java
@@ -105,6 +105,57 @@ public class CSVRecordExtractorTest extends AbstractRecordExtractorTest {
}
}
+ @Test
+ public void testRemovingSurroundingSpaces() throws IOException {
+ CSVRecordReaderConfig csvRecordReaderConfig = new CSVRecordReaderConfig();
+
+ // Create a CSV file with a trailing space in the header ("col1 ") and a leading space before each unquoted value.
+ File spaceFile = new File(_tempDir, "space.csv");
+ BufferedWriter writer = new BufferedWriter(new FileWriter(spaceFile));
+ writer.write("col1 ,col2\n");
+ writer.write(" value11, value12");
+ writer.close();
+
+ CSVRecordReader csvRecordReader = new CSVRecordReader();
+ HashSet<String> fieldsToRead = new HashSet<>();
+ fieldsToRead.add("col1");
+ fieldsToRead.add("col2");
+ csvRecordReader.init(spaceFile, fieldsToRead, csvRecordReaderConfig);
+ GenericRow genericRow = new GenericRow();
+ csvRecordReader.rewind();
+
+ // Parsing must succeed, and the column name and both values are expected with the surrounding spaces stripped.
+ Assert.assertTrue(csvRecordReader.hasNext());
+ csvRecordReader.next(genericRow);
+ Assert.assertEquals(genericRow.getValue("col1"), "value11");
+ Assert.assertEquals(genericRow.getValue("col2"), "value12");
+ }
+
+ @Test
+ public void testIgnoringSurroundingSpaces() throws IOException {
+ CSVRecordReaderConfig csvRecordReaderConfig = new CSVRecordReaderConfig();
+
+ // Create a CSV file where both values are quoted and the second one has a leading space inside the quotes.
+ File spaceFile = new File(_tempDir, "space.csv");
+ BufferedWriter writer = new BufferedWriter(new FileWriter(spaceFile));
+ writer.write("col1 ,col2\n");
+ writer.write("\"value11\",\" value12\"");
+ writer.close();
+
+ CSVRecordReader csvRecordReader = new CSVRecordReader();
+ HashSet<String> fieldsToRead = new HashSet<>();
+ fieldsToRead.add("col1");
+ fieldsToRead.add("col2");
+ csvRecordReader.init(spaceFile, fieldsToRead, csvRecordReaderConfig);
+ GenericRow genericRow = new GenericRow();
+ csvRecordReader.rewind();
+
+ // Parsing must succeed, and the space inside the quotes is expected to be kept as part of the value.
+ Assert.assertTrue(csvRecordReader.hasNext());
+ csvRecordReader.next(genericRow);
+ Assert.assertEquals(genericRow.getValue("col1"), "value11");
+ Assert.assertEquals(genericRow.getValue("col2"), " value12");
+ }
/**
* Check if we can parse a CSV file that has escaped comma characters within fields.
*/
@@ -135,6 +186,6 @@ public class CSVRecordExtractorTest extends AbstractRecordExtractorTest {
Assert.assertTrue(csvRecordReader.hasNext());
csvRecordReader.next(genericRow);
Assert.assertEquals(genericRow.getValue("first"), "string1");
- Assert.assertEquals(genericRow.getValue("second"), " string2, string3");
+ Assert.assertEquals(genericRow.getValue("second"), "string2, string3");
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org