You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by si...@apache.org on 2022/07/07 22:01:21 UTC

[pinot] branch master updated: [8835] Fix for CSV files surrounding space (#9028)

This is an automated email from the ASF dual-hosted git repository.

siddteotia pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new c1dea5951f [8835] Fix for CSV files surrounding space (#9028)
c1dea5951f is described below

commit c1dea5951fd513d77b479fcd44c77be81f757ec9
Author: Ravishankar <ra...@gmail.com>
AuthorDate: Fri Jul 8 03:31:13 2022 +0530

    [8835] Fix for CSV files surrounding space (#9028)
---
 .../plugin/inputformat/csv/CSVRecordReader.java    |  2 +-
 .../inputformat/csv/CSVRecordExtractorTest.java    | 53 +++++++++++++++++++++-
 2 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java b/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java
index 471dee9ea4..a95ee47036 100644
--- a/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java
+++ b/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java
@@ -80,7 +80,7 @@ public class CSVRecordReader implements RecordReader {
         }
       }
       char delimiter = config.getDelimiter();
-      format = format.withDelimiter(delimiter);
+      format = format.withDelimiter(delimiter).withIgnoreSurroundingSpaces(true);
       String csvHeader = config.getHeader();
       if (csvHeader == null) {
         format = format.withHeader();
diff --git a/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordExtractorTest.java b/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordExtractorTest.java
index d23fbd0dd7..4dea94f398 100644
--- a/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordExtractorTest.java
+++ b/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordExtractorTest.java
@@ -105,6 +105,57 @@ public class CSVRecordExtractorTest extends AbstractRecordExtractorTest {
     }
   }
 
+  @Test
+  public void testRemovingSurroundingSpaces() throws IOException {
+    CSVRecordReaderConfig csvRecordReaderConfig = new CSVRecordReaderConfig();
+
+    // Create a CSV file whose header has a trailing space and whose unquoted values have leading spaces.
+    File spaceFile = new File(_tempDir, "space.csv");
+    BufferedWriter writer = new BufferedWriter(new FileWriter(spaceFile));
+    writer.write("col1 ,col2\n");
+    writer.write(" value11, value12");
+    writer.close();
+
+    CSVRecordReader csvRecordReader = new CSVRecordReader();
+    HashSet<String> fieldsToRead = new HashSet<>();
+    fieldsToRead.add("col1");
+    fieldsToRead.add("col2");
+    csvRecordReader.init(spaceFile, fieldsToRead, csvRecordReaderConfig);
+    GenericRow genericRow = new GenericRow();
+    csvRecordReader.rewind();
+
+    // Check that parsing succeeded and that the spaces surrounding the unquoted values were trimmed.
+    Assert.assertTrue(csvRecordReader.hasNext());
+    csvRecordReader.next(genericRow);
+    Assert.assertEquals(genericRow.getValue("col1"), "value11");
+    Assert.assertEquals(genericRow.getValue("col2"), "value12");
+  }
+
+  @Test
+  public void testIgnoringSurroundingSpaces() throws IOException {
+    CSVRecordReaderConfig csvRecordReaderConfig = new CSVRecordReaderConfig();
+
+    // Create a CSV file where both values are quoted and the second has a leading space inside the quotes.
+    File spaceFile = new File(_tempDir, "space.csv");
+    BufferedWriter writer = new BufferedWriter(new FileWriter(spaceFile));
+    writer.write("col1 ,col2\n");
+    writer.write("\"value11\",\" value12\"");
+    writer.close();
+
+    CSVRecordReader csvRecordReader = new CSVRecordReader();
+    HashSet<String> fieldsToRead = new HashSet<>();
+    fieldsToRead.add("col1");
+    fieldsToRead.add("col2");
+    csvRecordReader.init(spaceFile, fieldsToRead, csvRecordReaderConfig);
+    GenericRow genericRow = new GenericRow();
+    csvRecordReader.rewind();
+
+    // Check that parsing succeeded and that the space inside the quotes was preserved.
+    Assert.assertTrue(csvRecordReader.hasNext());
+    csvRecordReader.next(genericRow);
+    Assert.assertEquals(genericRow.getValue("col1"), "value11");
+    Assert.assertEquals(genericRow.getValue("col2"), " value12");
+  }
   /**
    * Check if we can parse a CSV file that has escaped comma characters within fields.
    */
@@ -135,6 +186,6 @@ public class CSVRecordExtractorTest extends AbstractRecordExtractorTest {
     Assert.assertTrue(csvRecordReader.hasNext());
     csvRecordReader.next(genericRow);
     Assert.assertEquals(genericRow.getValue("first"), "string1");
-    Assert.assertEquals(genericRow.getValue("second"), " string2, string3");
+    Assert.assertEquals(genericRow.getValue("second"), "string2, string3");
   }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org