You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by dv...@apache.org on 2012/08/20 22:13:57 UTC
svn commit: r1375197 - in /pig/trunk: CHANGES.txt
contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/CSVExcelStorage.java
contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCSVExcelStorage.java
Author: dvryaboy
Date: Mon Aug 20 20:13:56 2012
New Revision: 1375197
URL: http://svn.apache.org/viewvc?rev=1375197&view=rev
Log:
PIG-2556: CSVExcelStorage load: quoted field with newline as first character sees newline as record end
Modified:
pig/trunk/CHANGES.txt
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/CSVExcelStorage.java
pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCSVExcelStorage.java
Modified: pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1375197&r1=1375196&r2=1375197&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Mon Aug 20 20:13:56 2012
@@ -24,6 +24,8 @@ INCOMPATIBLE CHANGES
IMPROVEMENTS
+PIG-2556: CSVExcelStorage load: quoted field with newline as first character sees newline as record end (tivv via dvryaboy)
+
PIG-2875: Add recursive record support to AvroStorage (cheolsoo via sms)
PIG-2662: skew join does not honor its config parameters (rajesh.balamohan via thejas)
Modified: pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/CSVExcelStorage.java
URL: http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/CSVExcelStorage.java?rev=1375197&r1=1375196&r2=1375197&view=diff
==============================================================================
--- pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/CSVExcelStorage.java (original)
+++ pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/CSVExcelStorage.java Mon Aug 20 20:13:56 2012
@@ -478,7 +478,6 @@ public class CSVExcelStorage extends Pig
mProtoTuple.clear();
getNextInQuotedField = false;
evenQuotesSeen = true;
- sawEmbeddedRecordDelimiter = false;
getNextFieldID = 0;
recordLen = prevLineAndContinuation.length;
@@ -494,16 +493,14 @@ public class CSVExcelStorage extends Pig
recordLen = value.getLength();
}
- sawEmbeddedRecordDelimiter = false;
-
nextTupleSkipChar = false;
ByteBuffer fieldBuffer = ByteBuffer.allocate(recordLen);
- sawEmbeddedRecordDelimiter = processOneInRecord(evenQuotesSeen,
- sawEmbeddedRecordDelimiter, buf, recordLen, fieldBuffer);
-
- // The last field is never delimited by a FIELD_DEL, but by
+ sawEmbeddedRecordDelimiter = processOneInRecord(evenQuotesSeen,
+ buf, recordLen, fieldBuffer);
+
+ // The last field is never delimited by a FIELD_DEL, but by
// the end of the record. So we need to add that last field.
// The '!sawEmbeddedRecordDelimiter' handles the case of
// embedded newlines; we are amidst a field, not at
@@ -567,9 +564,9 @@ public class CSVExcelStorage extends Pig
* @param fieldBuffer
* @return
*/
- private boolean processOneInRecord(boolean evenQuotesSeen,
- boolean sawEmbeddedRecordDelimiter, byte[] buf, int recordLen,
- ByteBuffer fieldBuffer) {
+ private boolean processOneInRecord(boolean evenQuotesSeen,
+ byte[] buf, int recordLen,
+ ByteBuffer fieldBuffer) {
for (int i = 0; i < recordLen; i++) {
if (nextTupleSkipChar) {
nextTupleSkipChar = false;
@@ -588,17 +585,6 @@ public class CSVExcelStorage extends Pig
if (evenQuotesSeen) {
fieldBuffer.put(DOUBLE_QUOTE);
}
- } else if (i == recordLen - 1) {
- // This is the last char we read from the input stream,
- // but we have an open double quote.
- // We either have a run-away quoted field (i.e. a missing
- // closing field in the record), or we have a field with
- // a record delimiter in it. We assume the latter,
- // and cause the outer while loop to run again, reading
- // more from the stream. Write out the delimiter:
- fieldBuffer.put(b);
- sawEmbeddedRecordDelimiter = true;
- continue;
} else
if (!evenQuotesSeen &&
(b == FIELD_DEL || b == RECORD_DEL)) {
@@ -624,8 +610,8 @@ public class CSVExcelStorage extends Pig
evenQuotesSeen = true;
fieldBuffer.put(b);
}
- } // end for
- return sawEmbeddedRecordDelimiter && (multilineTreatment == Multiline.YES);
+ } // end for
+ return getNextInQuotedField && (multilineTreatment == Multiline.YES);
}
private void readField(ByteBuffer buf, int fieldID) {
Modified: pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCSVExcelStorage.java
URL: http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCSVExcelStorage.java?rev=1375197&r1=1375196&r2=1375197&view=diff
==============================================================================
--- pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCSVExcelStorage.java (original)
+++ pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCSVExcelStorage.java Mon Aug 20 20:13:56 2012
@@ -40,17 +40,17 @@ import org.junit.Test;
public class TestCSVExcelStorage {
protected static final Log LOG = LogFactory.getLog(TestCSVExcelStorage.class);
-
+
private PigServer pigServer;
//private MiniCluster cluster;
-
+
Properties props = new Properties();
ArrayList<String> testMsgs = new ArrayList<String>();
-
+
String testFileCommaName = "testFileComma.csv";
String testFileTabName = "testFileTab.csv";
- String testStrComma =
+ String testStrComma =
"John,Doe,10\n" +
"Jane, \"nee, Smith\",20\n" +
",,\n" +
@@ -62,16 +62,18 @@ public class TestCSVExcelStorage {
"do we\n" +
"handle that?\",Good,Fairy\n";
- String[] testStrCommaArray =
+ String[] testStrCommaArray =
new String[] {
"John,Doe,10",
"Jane, \"nee, Smith\",20",
",,",
"\"Mac \"\"the knife\"\"\",Cohen,30",
"\"Conrad\nEmil\",Dinger,40",
+ "Emil,\"\nDinger\",40",
+ "Quote problem,\"My \"\"famous\"\"\nsong\",60",
"1st Field,\"A poem that continues\nfor several lines\ndo we\nhandle that?\",Good,Fairy",
};
-
+
@SuppressWarnings("serial")
ArrayList<Tuple> testStrCommaYesMultilineResultTuples =
new ArrayList<Tuple>() {
@@ -81,10 +83,12 @@ public class TestCSVExcelStorage {
add(Util.createTuple(new String[] {"", "", ""}));
add(Util.createTuple(new String[] {"Mac \"the knife\"", "Cohen", "30"}));
add(Util.createTuple(new String[] {"Conrad\nEmil", "Dinger", "40"}));
+ add(Util.createTuple(new String[] {"Emil", "\nDinger", "40"}));
+ add(Util.createTuple(new String[] {"Quote problem", "My \"famous\"\nsong", "60"}));
add(Util.createTuple(new String[] {"1st Field", "A poem that continues\nfor several lines\ndo we\nhandle that?", "Good", "Fairy"}));
}
};
-
+
@SuppressWarnings("serial")
ArrayList<Tuple> testStrCommaNoMultilineResultTuples =
new ArrayList<Tuple>() {
@@ -95,26 +99,30 @@ public class TestCSVExcelStorage {
add(Util.createTuple(new String[] {"Mac \"the knife\"", "Cohen", "30"}));
add(Util.createTuple(new String[] {"Conrad"}));
add(Util.createTuple(new String[] {"Emil,Dinger,40"})); // Trailing double quote after Emil eats rest of line
+ add(Util.createTuple(new String[] {"Emil"}));
+ add(Util.createTuple(new String[] {"Dinger,40"})); // Trailing double quote after Emil eats rest of line
+ add(Util.createTuple(new String[] {"Quote problem", "My \"famous\""}));
+ add(Util.createTuple(new String[] {"song,60"}));
add(Util.createTuple(new String[] {"1st Field", "A poem that continues"}));
add(Util.createTuple(new String[] {"for several lines"}));
add(Util.createTuple(new String[] {"do we"}));
add(Util.createTuple(new String[] {"handle that?,Good,Fairy"})); // Trailing double quote eats rest of line
}
};
-
- String testStrTab =
+
+ String testStrTab =
"John\tDoe\t50\n" +
"\"Foo and CR last\n" +
"bar.\"\t\t\n" +
"Frank\tClean\t70";
- String[] testStrTabArray =
+ String[] testStrTabArray =
new String[] {
"John\tDoe\t50",
"\"Foo and CR last\nbar.\"\t\t",
"Frank\tClean\t70"
};
-
+
@SuppressWarnings("serial")
ArrayList<Tuple> testStrTabYesMultilineResultTuples =
new ArrayList<Tuple>() {
@@ -124,7 +132,7 @@ public class TestCSVExcelStorage {
add(Util.createTuple(new String[] {"Frank","Clean","70"}));
}
};
-
+
public TestCSVExcelStorage() throws ExecException, IOException {
pigServer = new PigServer(ExecType.LOCAL);
@@ -132,7 +140,9 @@ public class TestCSVExcelStorage {
.setProperty("mapred.map.max.attempts", "1");
pigServer.getPigContext().getProperties()
.setProperty("mapred.reduce.max.attempts", "1");
-
+ pigServer.getPigContext().getProperties()
+ .setProperty("mapreduce.job.end-notification.retry.interval", "100");
+
Util.createLocalInputFile(testFileCommaName, testStrCommaArray);
Util.createLocalInputFile(testFileTabName, testStrTabArray);
}
@@ -147,8 +157,8 @@ public class TestCSVExcelStorage {
Iterator<Tuple> it = pigServer.openIterator("a");
assertEquals(Util.createTuple(new String[] {"foo", "bar", "baz"}), it.next());
}
-
- @Test
+
+ @Test
public void testQuotedCommas() throws IOException {
String inputFileName = "TestCSVExcelStorage-quotedcommas.txt";
Util.createLocalInputFile(inputFileName, new String[] {"\"foo,bar,baz\"", "fee,foe,fum"});
@@ -159,11 +169,11 @@ public class TestCSVExcelStorage {
assertEquals(Util.createTuple(new String[] {"foo,bar,baz", null, null}), it.next());
assertEquals(Util.createTuple(new String[] {"fee", "foe", "fum"}), it.next());
}
-
+
@Test
public void testQuotedQuotes() throws IOException {
String inputFileName = "TestCSVExcelStorage-quotedquotes.txt";
- Util.createLocalInputFile(inputFileName,
+ Util.createLocalInputFile(inputFileName,
new String[] {"\"foo,\"\"bar\"\",baz\"", "\"\"\"\"\"\"\"\""});
String script = "a = load '" + inputFileName + "' using org.apache.pig.piggybank.storage.CSVExcelStorage() " +
" as (a:chararray); ";
@@ -176,26 +186,26 @@ public class TestCSVExcelStorage {
@Test
public void testMultiline() throws IOException {
// Read the test file:
- String script =
- "a = LOAD '" + testFileCommaName + "' " +
+ String script =
+ "a = LOAD '" + testFileCommaName + "' " +
"USING org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'YES_MULTILINE');";
Util.registerMultiLineQuery(pigServer, script);
compareExpectedActual(testStrCommaYesMultilineResultTuples, "a");
-
+
// Store the test file back down into another file using YES_MULTILINE:
String testOutFileName = createOutputFileName();
script = "STORE a INTO '" + testOutFileName + "' USING " +
"org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'YES_MULTILINE');";
pigServer.registerQuery(script);
-
+
// Read it back out using YES_MULTILINE, and see whether it's still correct:
- script = "b = LOAD '" + testOutFileName + "' " +
+ script = "b = LOAD '" + testOutFileName + "' " +
"USING org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'YES_MULTILINE');";
Util.registerMultiLineQuery(pigServer, script);
compareExpectedActual(testStrCommaYesMultilineResultTuples, "b");
-
+
// Now read it back again, but multilines turned off:
- script = "c = LOAD '" + testOutFileName + "' " +
+ script = "c = LOAD '" + testOutFileName + "' " +
"USING org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'NO_MULTILINE');";
Util.registerMultiLineQuery(pigServer, script);
compareExpectedActual(testStrCommaNoMultilineResultTuples, "c");
@@ -205,32 +215,32 @@ public class TestCSVExcelStorage {
script = "STORE c INTO '" + testOutFileName + "' USING " +
"org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'NO_MULTILINE');";
pigServer.registerQuery(script);
-
+
// Read it back in, again with NO_MULTILINE and see whether it's still correct:
- script = "d = LOAD '" + testOutFileName + "' " +
+ script = "d = LOAD '" + testOutFileName + "' " +
"USING org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'NO_MULTILINE');";
Util.registerMultiLineQuery(pigServer, script);
compareExpectedActual(testStrCommaNoMultilineResultTuples, "d");
}
-
+
@Test
public void testTabDelimiter() throws IOException {
// Read the test file:
- String script =
- "e = LOAD '" + testFileTabName + "' " +
+ String script =
+ "e = LOAD '" + testFileTabName + "' " +
"USING org.apache.pig.piggybank.storage.CSVExcelStorage('\t', 'YES_MULTILINE');";
Util.registerMultiLineQuery(pigServer, script);
compareExpectedActual(testStrTabYesMultilineResultTuples, "e");
-
+
// Store the test file back down into another file using YES_MULTILINE:
String testOutFileName = createOutputFileName();
script = "STORE e INTO '" + testOutFileName + "' USING " +
"org.apache.pig.piggybank.storage.CSVExcelStorage('\t', 'YES_MULTILINE');";
pigServer.registerQuery(script);
-
+
// Read it back out using YES_MULTILINE, and see whether it's still correct:
- script = "f = LOAD '" + testOutFileName + "' " +
+ script = "f = LOAD '" + testOutFileName + "' " +
"USING org.apache.pig.piggybank.storage.CSVExcelStorage('\t', 'YES_MULTILINE');";
Util.registerMultiLineQuery(pigServer, script);
compareExpectedActual(testStrTabYesMultilineResultTuples, "f");
@@ -239,13 +249,13 @@ public class TestCSVExcelStorage {
private void compareExpectedActual(ArrayList<Tuple> theExpected, String theActualPigVarAlias) throws IOException {
Iterator<Tuple> actualIt = pigServer.openIterator(theActualPigVarAlias);
Iterator<Tuple> expIt = theExpected.iterator();
-
+
while (actualIt.hasNext()) {
Tuple actual = actualIt.next();
if (!expIt.hasNext())
Assert.fail("The input contains more records than expected. First unexpected record: " + actual);
Tuple expected = expIt.next();
- // The following assert does not work, even if
+ // The following assert does not work, even if
// the two tuples are identical in class (BinSedesTuple)
// and content. We need to compare element by element:
//assertEquals(expected, actual);
@@ -256,24 +266,24 @@ public class TestCSVExcelStorage {
}
}
}
-
+
/*
* Hack to get a temp file name to store data into.
* The file must not exist when the caller subsequently
- * tries to write to it. In non-testing code this
+ * tries to write to it. In non-testing code this
* would be an intolerable race condition. There's
- * likely a better way.
+ * likely a better way.
*/
private String createOutputFileName() throws IOException {
File f = File.createTempFile("CSVExcelStorageTest", "csv");
f.deleteOnExit();
f.delete();
- // On Windows this path will be C:\\..., which
- // causes errors in the Hadoop environment. Replace
+ // On Windows this path will be C:\\..., which
+ // causes errors in the Hadoop environment. Replace
// the backslashes with forward slashes:
return f.getAbsolutePath().replaceAll("\\\\", "/");
- }
-
+ }
+
public static void main(String[] args) {
TestCSVExcelStorage tester = null;
try {
@@ -282,14 +292,14 @@ public class TestCSVExcelStorage {
tester.testQuotedCommas();
tester.testQuotedQuotes();
tester.testMultiline();
- tester.testTabDelimiter();
+ tester.testTabDelimiter();
System.out.println("CSVExcelStorage() passed all tests.");
-
+
} catch (ExecException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
-
+
}