You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2020/06/10 18:27:05 UTC

[nutch] branch master updated: NUTCH-2790 indexer-csv: escape field leading quote character

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 6fa02ef  NUTCH-2790 indexer-csv: escape field leading quote character
     new 3feaf03  Merge pull request #532 from pmezard/NUTCH-2790
6fa02ef is described below

commit 6fa02ef511a1375e765f572f6c9eecb86c96fbc5
Author: Patrick Mezard <pa...@mezard.eu>
AuthorDate: Tue Jun 9 17:00:16 2020 +0200

    NUTCH-2790 indexer-csv: escape field leading quote character
    
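    Before the change, a quote character at the very start of a field
    value (e.g. '"value') was left unescaped, because the escaping loop
    condition required the quote position to be greater than zero.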
---
 .../java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java    | 3 +--
 .../org/apache/nutch/indexwriter/csv/TestCSVIndexWriter.java     | 9 +++++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java b/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java
index 160d03d..99c0702 100644
--- a/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java
+++ b/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java
@@ -405,13 +405,12 @@ public class CSVIndexWriter implements IndexWriter {
     if (max > maxFieldLength) {
       max = maxFieldLength;
     }
-    while (nextQuoteChar > 0 && nextQuoteChar < max) {
+    while (nextQuoteChar >= 0 && nextQuoteChar < max) {
       csvout.write(value.substring(start, nextQuoteChar).getBytes(encoding));
       csvout.write(escapeCharacter.bytes);
       csvout.write(quoteCharacter.bytes);
       start = nextQuoteChar + 1;
       nextQuoteChar = quoteCharacter.find(value, start);
-      if (nextQuoteChar > max) break;
     }
     csvout.write(value.substring(start, max).getBytes(encoding));
   }
diff --git a/src/plugin/indexer-csv/src/test/org/apache/nutch/indexwriter/csv/TestCSVIndexWriter.java b/src/plugin/indexer-csv/src/test/org/apache/nutch/indexwriter/csv/TestCSVIndexWriter.java
index 761d042..5714cc2 100644
--- a/src/plugin/indexer-csv/src/test/org/apache/nutch/indexwriter/csv/TestCSVIndexWriter.java
+++ b/src/plugin/indexer-csv/src/test/org/apache/nutch/indexwriter/csv/TestCSVIndexWriter.java
@@ -159,6 +159,15 @@ public class TestCSVIndexWriter {
   }
 
   @Test
+  public void testCSVescapeLeadingQuotes() throws IOException {
+    String[] params = { CSVConstants.CSV_FIELDS, "test" };
+    String[] fields = { "test", "\"quote\"" };
+    String csv = getCSV(params, fields);
+    assertEquals("Leading quotes inside a quoted field must be escaped",
+        "\"\"\"quote\"\"\"", csv.trim());
+  }
+
+  @Test
   public void testCSVclipMaxLength() throws IOException {
     String[] params = { CSVConstants.CSV_FIELDS, "test",
         CSVConstants.CSV_MAXFIELDLENGTH, "8" };