You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2020/06/10 18:27:05 UTC
[nutch] branch master updated: NUTCH-2790 indexer-csv: escape field leading quote character
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 6fa02ef NUTCH-2790 indexer-csv: escape field leading quote character
new 3feaf03 Merge pull request #532 from pmezard/NUTCH-2790
6fa02ef is described below
commit 6fa02ef511a1375e765f572f6c9eecb86c96fbc5
Author: Patrick Mezard <pa...@mezard.eu>
AuthorDate: Tue Jun 9 17:00:16 2020 +0200
NUTCH-2790 indexer-csv: escape field leading quote character
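Before this change, a quote character at the very start of a field value
(e.g. '"value') was left unescaped, because the escaping loop only ran
while the index of the next quote was > 0 and therefore skipped index 0.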
---
.../java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java | 3 +--
.../org/apache/nutch/indexwriter/csv/TestCSVIndexWriter.java | 9 +++++++++
2 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java b/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java
index 160d03d..99c0702 100644
--- a/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java
+++ b/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java
@@ -405,13 +405,12 @@ public class CSVIndexWriter implements IndexWriter {
if (max > maxFieldLength) {
max = maxFieldLength;
}
- while (nextQuoteChar > 0 && nextQuoteChar < max) {
+ while (nextQuoteChar >= 0 && nextQuoteChar < max) {
csvout.write(value.substring(start, nextQuoteChar).getBytes(encoding));
csvout.write(escapeCharacter.bytes);
csvout.write(quoteCharacter.bytes);
start = nextQuoteChar + 1;
nextQuoteChar = quoteCharacter.find(value, start);
- if (nextQuoteChar > max) break;
}
csvout.write(value.substring(start, max).getBytes(encoding));
}
diff --git a/src/plugin/indexer-csv/src/test/org/apache/nutch/indexwriter/csv/TestCSVIndexWriter.java b/src/plugin/indexer-csv/src/test/org/apache/nutch/indexwriter/csv/TestCSVIndexWriter.java
index 761d042..5714cc2 100644
--- a/src/plugin/indexer-csv/src/test/org/apache/nutch/indexwriter/csv/TestCSVIndexWriter.java
+++ b/src/plugin/indexer-csv/src/test/org/apache/nutch/indexwriter/csv/TestCSVIndexWriter.java
@@ -159,6 +159,15 @@ public class TestCSVIndexWriter {
}
@Test
+ public void testCSVescapeLeadingQuotes() throws IOException {
+ String[] params = { CSVConstants.CSV_FIELDS, "test" };
+ String[] fields = { "test", "\"quote\"" };
+ String csv = getCSV(params, fields);
+ assertEquals("Leading quotes inside a quoted field must be escaped",
+ "\"\"\"quote\"\"\"", csv.trim());
+ }
+
+ @Test
public void testCSVclipMaxLength() throws IOException {
String[] params = { CSVConstants.CSV_FIELDS, "test",
CSVConstants.CSV_MAXFIELDLENGTH, "8" };