You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by ma...@apache.org on 2021/03/02 21:38:50 UTC

[incubator-pinot] branch master updated: Allow escaping comma characters in CSV files. (#6627)

This is an automated email from the ASF dual-hosted git repository.

mayanks pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 3cef97a  Allow escaping comma characters in CSV files. (#6627)
3cef97a is described below

commit 3cef97a7743550077c33f27ceae140d445adf34b
Author: Amrish Lal <am...@gmail.com>
AuthorDate: Tue Mar 2 13:38:38 2021 -0800

    Allow escaping comma characters in CSV files. (#6627)
    
    * Allow escaping comma characters in CSV files.
    
    * Allow escaping comma characters in CSV files.
    
    * Set backslash as default escape character for parsing CSV files.
    
    * Setting default escape character to null to avoid test case failures.
---
 .../plugin/inputformat/csv/CSVRecordReader.java    |  1 +
 .../inputformat/csv/CSVRecordReaderConfig.java     | 12 ++++++--
 .../inputformat/csv/CSVRecordExtractorTest.java    | 36 ++++++++++++++++++++++
 3 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java b/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java
index 48c9b00..fc14272 100644
--- a/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java
+++ b/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java
@@ -89,6 +89,7 @@ public class CSVRecordReader implements RecordReader {
       }
       Character commentMarker = config.getCommentMarker();
       format = format.withCommentMarker(commentMarker);
+      format = format.withEscape(config.getEscapeCharacter());
       _format = format;
       multiValueDelimiter = config.getMultiValueDelimiter();
     }
diff --git a/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReaderConfig.java b/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReaderConfig.java
index b30c09f..448de85 100644
--- a/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReaderConfig.java
+++ b/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReaderConfig.java
@@ -32,8 +32,8 @@ public class CSVRecordReaderConfig implements RecordReaderConfig {
   private String _header;
   private char _delimiter = DEFAULT_DELIMITER;
   private char _multiValueDelimiter = DEFAULT_MULTI_VALUE_DELIMITER;
-
-  private Character _commentMarker;  // Default is null
+  private Character _commentMarker;   // Default is null
+  private Character _escapeCharacter; // Default is null
 
   public String getFileFormat() {
     return _fileFormat;
@@ -75,6 +75,14 @@ public class CSVRecordReaderConfig implements RecordReaderConfig {
     _commentMarker = commentMarker;
   }
 
+  public Character getEscapeCharacter() {
+    return _escapeCharacter; // may be null: the field is declared without an initializer (default is null)
+  }
+
+  public void setEscapeCharacter(Character escapeCharacter) {
+    _escapeCharacter = escapeCharacter; // stored as-is; CSVRecordReader later hands it to format.withEscape(...)
+  }
+
   @Override
   public String toString() {
     return ToStringBuilder.reflectionToString(this, ToStringStyle.SHORT_PREFIX_STYLE);
diff --git a/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordExtractorTest.java b/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordExtractorTest.java
index 7432d5b..93994c9 100644
--- a/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordExtractorTest.java
+++ b/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordExtractorTest.java
@@ -18,10 +18,12 @@
  */
 package org.apache.pinot.plugin.inputformat.csv;
 
+import java.io.BufferedWriter;
 import java.io.File;
 import java.io.FileWriter;
 import java.io.IOException;
 import java.util.Collection;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -32,6 +34,7 @@ import org.apache.pinot.spi.data.readers.AbstractRecordExtractorTest;
 import org.apache.pinot.spi.data.readers.GenericRow;
 import org.apache.pinot.spi.data.readers.RecordReader;
 import org.testng.Assert;
+import org.testng.annotations.Test;
 
 
 /**
@@ -101,4 +104,37 @@ public class CSVRecordExtractorTest extends AbstractRecordExtractorTest {
       }
     }
   }
+
+  /**
+   * Verifies that a comma preceded by the configured escape character stays inside the field value.
+   */
+  @Test
+  public void testEscapeCharacterInCSV()
+    throws Exception {
+    // Configure the reader with backslash as the escape character.
+    CSVRecordReaderConfig csvRecordReaderConfig = new CSVRecordReaderConfig();
+    csvRecordReaderConfig.setEscapeCharacter('\\');
+
+    // Write a header and one record whose second value has an escaped comma; the writer is closed even on failure.
+    File escapedFile = new File(_tempDir, "escape.csv");
+    try (BufferedWriter writer = new BufferedWriter(new FileWriter(escapedFile))) {
+      writer.write("first,second\n");
+      writer.write("string1, string2\\, string3");
+    }
+
+    // Read the file back, requesting both columns.
+    CSVRecordReader csvRecordReader = new CSVRecordReader();
+    Set<String> fieldsToRead = new HashSet<>();
+    fieldsToRead.add("first");
+    fieldsToRead.add("second");
+    csvRecordReader.init(escapedFile, fieldsToRead, csvRecordReaderConfig);
+    GenericRow genericRow = new GenericRow();
+    csvRecordReader.rewind();
+
+    // The escaped comma must not split the second field; the leading space after the delimiter is preserved.
+    Assert.assertTrue(csvRecordReader.hasNext());
+    csvRecordReader.next(genericRow);
+    Assert.assertEquals(genericRow.getValue("first"), "string1");
+    Assert.assertEquals(genericRow.getValue("second"), " string2, string3");
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org