You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by ch...@apache.org on 2013/03/26 05:38:01 UTC
svn commit: r1460972 [2/2] - in /pig/trunk: CHANGES.txt
contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/CSVExcelStorage.java
contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCSVExcelStorage.java
Modified: pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCSVExcelStorage.java
URL: http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCSVExcelStorage.java?rev=1460972&r1=1460971&r2=1460972&view=diff
==============================================================================
--- pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCSVExcelStorage.java (original)
+++ pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCSVExcelStorage.java Tue Mar 26 04:38:01 2013
@@ -1,305 +1,428 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.pig.piggybank.test.storage;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.Properties;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.pig.ExecType;
-import org.apache.pig.PigServer;
-import org.apache.pig.backend.executionengine.ExecException;
-import org.apache.pig.data.DataByteArray;
-import org.apache.pig.data.Tuple;
-import org.apache.pig.test.Util;
-import org.junit.Assert;
-import org.junit.Test;
-
-public class TestCSVExcelStorage {
-
- protected static final Log LOG = LogFactory.getLog(TestCSVExcelStorage.class);
-
- private PigServer pigServer;
- //private MiniCluster cluster;
-
- Properties props = new Properties();
- ArrayList<String> testMsgs = new ArrayList<String>();
-
- String testFileCommaName = "testFileComma.csv";
- String testFileTabName = "testFileTab.csv";
-
- String testStrComma =
- "John,Doe,10\n" +
- "Jane, \"nee, Smith\",20\n" +
- ",,\n" +
- "\"Mac \"\"the knife\"\"\",Cohen,30\n" +
- "\"Conrad\n" +
- "Emil\",Dinger,40\n" +
- "1st Field,\"A poem that continues\n" +
- "for several lines\n" +
- "do we\n" +
- "handle that?\",Good,Fairy\n";
-
- String[] testStrCommaArray =
- new String[] {
- "John,Doe,10",
- "Jane, \"nee, Smith\",20",
- ",,",
- "\"Mac \"\"the knife\"\"\",Cohen,30",
- "\"Conrad\nEmil\",Dinger,40",
- "Emil,\"\nDinger\",40",
- "Quote problem,\"My \"\"famous\"\"\nsong\",60",
- "1st Field,\"A poem that continues\nfor several lines\ndo we\nhandle that?\",Good,Fairy",
- };
-
- @SuppressWarnings("serial")
- ArrayList<Tuple> testStrCommaYesMultilineResultTuples =
- new ArrayList<Tuple>() {
- {
- add(Util.createTuple(new String[] {"John","Doe","10"}));
- add(Util.createTuple(new String[] {"Jane", " nee, Smith","20"}));
- add(Util.createTuple(new String[] {"", "", ""}));
- add(Util.createTuple(new String[] {"Mac \"the knife\"", "Cohen", "30"}));
- add(Util.createTuple(new String[] {"Conrad\nEmil", "Dinger", "40"}));
- add(Util.createTuple(new String[] {"Emil", "\nDinger", "40"}));
- add(Util.createTuple(new String[] {"Quote problem", "My \"famous\"\nsong", "60"}));
- add(Util.createTuple(new String[] {"1st Field", "A poem that continues\nfor several lines\ndo we\nhandle that?", "Good", "Fairy"}));
- }
- };
-
- @SuppressWarnings("serial")
- ArrayList<Tuple> testStrCommaNoMultilineResultTuples =
- new ArrayList<Tuple>() {
- {
- add(Util.createTuple(new String[] {"John","Doe","10"}));
- add(Util.createTuple(new String[] {"Jane", " nee, Smith","20"}));
- add(Util.createTuple(new String[] {"", "", ""}));
- add(Util.createTuple(new String[] {"Mac \"the knife\"", "Cohen", "30"}));
- add(Util.createTuple(new String[] {"Conrad"}));
- add(Util.createTuple(new String[] {"Emil,Dinger,40"})); // Trailing double quote after Emil eats rest of line
- add(Util.createTuple(new String[] {"Emil"}));
- add(Util.createTuple(new String[] {"Dinger,40"})); // Trailing double quote after Emil eats rest of line
- add(Util.createTuple(new String[] {"Quote problem", "My \"famous\""}));
- add(Util.createTuple(new String[] {"song,60"}));
- add(Util.createTuple(new String[] {"1st Field", "A poem that continues"}));
- add(Util.createTuple(new String[] {"for several lines"}));
- add(Util.createTuple(new String[] {"do we"}));
- add(Util.createTuple(new String[] {"handle that?,Good,Fairy"})); // Trailing double quote eats rest of line
- }
- };
-
- String testStrTab =
- "John\tDoe\t50\n" +
- "\"Foo and CR last\n" +
- "bar.\"\t\t\n" +
- "Frank\tClean\t70";
-
- String[] testStrTabArray =
- new String[] {
- "John\tDoe\t50",
- "\"Foo and CR last\nbar.\"\t\t",
- "Frank\tClean\t70"
- };
-
- @SuppressWarnings("serial")
- ArrayList<Tuple> testStrTabYesMultilineResultTuples =
- new ArrayList<Tuple>() {
- {
- add(Util.createTuple(new String[] {"John","Doe","50"}));
- add(Util.createTuple(new String[] {"Foo and CR last\nbar.","",""}));
- add(Util.createTuple(new String[] {"Frank","Clean","70"}));
- }
- };
-
- public TestCSVExcelStorage() throws ExecException, IOException {
-
- pigServer = new PigServer(ExecType.LOCAL);
- pigServer.getPigContext().getProperties()
- .setProperty("mapred.map.max.attempts", "1");
- pigServer.getPigContext().getProperties()
- .setProperty("mapred.reduce.max.attempts", "1");
- pigServer.getPigContext().getProperties()
- .setProperty("mapreduce.job.end-notification.retry.interval", "100");
-
- Util.createLocalInputFile(testFileCommaName, testStrCommaArray);
- Util.createLocalInputFile(testFileTabName, testStrTabArray);
- }
-
- @Test
- public void testSimpleCsv() throws IOException {
- String inputFileName = "TestCSVExcelStorage-simple.txt";
- Util.createLocalInputFile(inputFileName, new String[] {"foo,bar,baz", "fee,foe,fum"});
- String script = "a = load '" + inputFileName + "' using org.apache.pig.piggybank.storage.CSVExcelStorage() " +
- " as (a:chararray, b:chararray, c:chararray); ";
- Util.registerMultiLineQuery(pigServer, script);
- Iterator<Tuple> it = pigServer.openIterator("a");
- assertEquals(Util.createTuple(new String[] {"foo", "bar", "baz"}), it.next());
- }
-
- @Test
- public void testQuotedCommas() throws IOException {
- String inputFileName = "TestCSVExcelStorage-quotedcommas.txt";
- Util.createLocalInputFile(inputFileName, new String[] {"\"foo,bar,baz\"", "fee,foe,fum"});
- String script = "a = load '" + inputFileName + "' using org.apache.pig.piggybank.storage.CSVExcelStorage() " +
- " as (a:chararray, b:chararray, c:chararray); ";
- Util.registerMultiLineQuery(pigServer, script);
- Iterator<Tuple> it = pigServer.openIterator("a");
- assertEquals(Util.createTuple(new String[] {"foo,bar,baz", null, null}), it.next());
- assertEquals(Util.createTuple(new String[] {"fee", "foe", "fum"}), it.next());
- }
-
- @Test
- public void testQuotedQuotes() throws IOException {
- String inputFileName = "TestCSVExcelStorage-quotedquotes.txt";
- Util.createLocalInputFile(inputFileName,
- new String[] {"\"foo,\"\"bar\"\",baz\"", "\"\"\"\"\"\"\"\""});
- String script = "a = load '" + inputFileName + "' using org.apache.pig.piggybank.storage.CSVExcelStorage() " +
- " as (a:chararray); ";
- Util.registerMultiLineQuery(pigServer, script);
- Iterator<Tuple> it = pigServer.openIterator("a");
- assertEquals(Util.createTuple(new String[] {"foo,\"bar\",baz"}), it.next());
- assertEquals(Util.createTuple(new String[] {"\"\"\"\""}), it.next());
- }
-
- @Test
- public void testMultiline() throws IOException {
- // Read the test file:
- String script =
- "a = LOAD '" + testFileCommaName + "' " +
- "USING org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'YES_MULTILINE');";
- Util.registerMultiLineQuery(pigServer, script);
- compareExpectedActual(testStrCommaYesMultilineResultTuples, "a");
-
- // Store the test file back down into another file using YES_MULTILINE:
- String testOutFileName = createOutputFileName();
- script = "STORE a INTO '" + testOutFileName + "' USING " +
- "org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'YES_MULTILINE');";
- pigServer.registerQuery(script);
-
- // Read it back out using YES_MULTILINE, and see whether it's still correct:
- script = "b = LOAD '" + testOutFileName + "' " +
- "USING org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'YES_MULTILINE');";
- Util.registerMultiLineQuery(pigServer, script);
- compareExpectedActual(testStrCommaYesMultilineResultTuples, "b");
-
- // Now read it back again, but multilines turned off:
- script = "c = LOAD '" + testOutFileName + "' " +
- "USING org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'NO_MULTILINE');";
- Util.registerMultiLineQuery(pigServer, script);
- compareExpectedActual(testStrCommaNoMultilineResultTuples, "c");
-
- // Store this re-read test file back down again, into another file using NO_MULTILINE:
- testOutFileName = createOutputFileName();
- script = "STORE c INTO '" + testOutFileName + "' USING " +
- "org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'NO_MULTILINE');";
- pigServer.registerQuery(script);
-
- // Read it back in, again with NO_MULTILINE and see whether it's still correct:
- script = "d = LOAD '" + testOutFileName + "' " +
- "USING org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'NO_MULTILINE');";
- Util.registerMultiLineQuery(pigServer, script);
- compareExpectedActual(testStrCommaNoMultilineResultTuples, "d");
-
- }
-
- @Test
- public void testTabDelimiter() throws IOException {
- // Read the test file:
- String script =
- "e = LOAD '" + testFileTabName + "' " +
- "USING org.apache.pig.piggybank.storage.CSVExcelStorage('\t', 'YES_MULTILINE');";
- Util.registerMultiLineQuery(pigServer, script);
- compareExpectedActual(testStrTabYesMultilineResultTuples, "e");
-
- // Store the test file back down into another file using YES_MULTILINE:
- String testOutFileName = createOutputFileName();
- script = "STORE e INTO '" + testOutFileName + "' USING " +
- "org.apache.pig.piggybank.storage.CSVExcelStorage('\t', 'YES_MULTILINE');";
- pigServer.registerQuery(script);
-
- // Read it back out using YES_MULTILINE, and see whether it's still correct:
- script = "f = LOAD '" + testOutFileName + "' " +
- "USING org.apache.pig.piggybank.storage.CSVExcelStorage('\t', 'YES_MULTILINE');";
- Util.registerMultiLineQuery(pigServer, script);
- compareExpectedActual(testStrTabYesMultilineResultTuples, "f");
- }
-
- private void compareExpectedActual(ArrayList<Tuple> theExpected, String theActualPigVarAlias) throws IOException {
- Iterator<Tuple> actualIt = pigServer.openIterator(theActualPigVarAlias);
- Iterator<Tuple> expIt = theExpected.iterator();
-
- while (actualIt.hasNext()) {
- Tuple actual = actualIt.next();
- if (!expIt.hasNext())
- Assert.fail("The input contains more records than expected. First unexpected record: " + actual);
- Tuple expected = expIt.next();
- // The following assert does not work, even if
- // the two tuples are identical in class (BinSedesTuple)
- // and content. We need to compare element by element:
- //assertEquals(expected, actual);
- for (int i=0; i<expected.size(); i++) {
- String truthEl = (String) expected.get(i);
- String actualEl = new String(((DataByteArray) actual.get(i)).get());
- assertEquals(truthEl, actualEl);
- }
- }
- }
-
- /*
- * Hack to get a temp file name to store data into.
- * The file must not exist when the caller subsequently
- * tries to write to it. In non-testing code this
- * would be an intolerable race condition. There's
- * likely a better way.
- */
- private String createOutputFileName() throws IOException {
- File f = File.createTempFile("CSVExcelStorageTest", "csv");
- f.deleteOnExit();
- f.delete();
- // On Windows this path will be C:\\..., which
- // causes errors in the Hadoop environment. Replace
- // the backslashes with forward slashes:
- return f.getAbsolutePath().replaceAll("\\\\", "/");
- }
-
- public static void main(String[] args) {
- TestCSVExcelStorage tester = null;
- try {
- tester = new TestCSVExcelStorage();
- tester.testSimpleCsv();
- tester.testQuotedCommas();
- tester.testQuotedQuotes();
- tester.testMultiline();
- tester.testTabDelimiter();
-System.out.println("CSVExcelStorage() passed all tests.");
-
- } catch (ExecException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pig.piggybank.test.storage;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.Properties;
+
+import junit.framework.Assert;
+
+import org.apache.commons.lang.StringUtils;
+
+import org.apache.pig.ExecType;
+import org.apache.pig.PigServer;
+import org.apache.pig.data.DataByteArray;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.tools.parameters.ParseException;
+import org.apache.pig.test.Util;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestCSVExcelStorage {
+
+ Properties props = new Properties();
+ ArrayList<String> testMsgs = new ArrayList<String>();
+
+ String testFileCommaName = "testFileComma.csv";
+ String testFileTabName = "testFileTab.csv";
+
+ String testStrComma =
+ "John,Doe,10\n" +
+ "Jane, \"nee, Smith\",20\n" +
+ ",,\n" +
+ "\"Mac \"\"the knife\"\"\",Cohen,30\n" +
+ "\"Conrad\n" +
+ "Emil\",Dinger,40\n" +
+ "1st Field,\"A poem that continues\n" +
+ "for several lines\n" +
+ "do we\n" +
+ "handle that?\",Good,Fairy\n";
+
+ String[] testStrCommaArray =
+ new String[] {
+ "John,Doe,10",
+ "Jane, \"nee, Smith\",20",
+ ",,",
+ "\"Mac \"\"the knife\"\"\",Cohen,30",
+ "\"Conrad\nEmil\",Dinger,40",
+ "Emil,\"\nDinger\",40",
+ "Quote problem,\"My \"\"famous\"\"\nsong\",60",
+ "1st Field,\"A poem that continues\nfor several lines\ndo we\nhandle that?\",Good,Fairy",
+ };
+
+ @SuppressWarnings("serial")
+ ArrayList<Tuple> testStrCommaYesMultilineResultTuples =
+ new ArrayList<Tuple>() {
+ {
+ add(Util.createTuple(new String[] {"John","Doe","10"}));
+ add(Util.createTuple(new String[] {"Jane", " nee, Smith","20"}));
+ add(Util.createTuple(new String[] {"", "", ""}));
+ add(Util.createTuple(new String[] {"Mac \"the knife\"", "Cohen", "30"}));
+ add(Util.createTuple(new String[] {"Conrad\nEmil", "Dinger", "40"}));
+ add(Util.createTuple(new String[] {"Emil", "\nDinger", "40"}));
+ add(Util.createTuple(new String[] {"Quote problem", "My \"famous\"\nsong", "60"}));
+ add(Util.createTuple(new String[] {"1st Field", "A poem that continues\nfor several lines\ndo we\nhandle that?", "Good", "Fairy"}));
+ }
+ };
+
+ @SuppressWarnings("serial")
+ ArrayList<Tuple> testStrCommaNoMultilineResultTuples =
+ new ArrayList<Tuple>() {
+ {
+ add(Util.createTuple(new String[] {"John","Doe","10"}));
+ add(Util.createTuple(new String[] {"Jane", " nee, Smith","20"}));
+ add(Util.createTuple(new String[] {"", "", ""}));
+ add(Util.createTuple(new String[] {"Mac \"the knife\"", "Cohen", "30"}));
+ add(Util.createTuple(new String[] {"Conrad"}));
+ add(Util.createTuple(new String[] {"Emil,Dinger,40"})); // Trailing double quote after Emil eats rest of line
+ add(Util.createTuple(new String[] {"Emil"}));
+ add(Util.createTuple(new String[] {"Dinger,40"})); // Trailing double quote after Dinger eats rest of line
+ add(Util.createTuple(new String[] {"Quote problem", "My \"famous\""}));
+ add(Util.createTuple(new String[] {"song,60"}));
+ add(Util.createTuple(new String[] {"1st Field", "A poem that continues"}));
+ add(Util.createTuple(new String[] {"for several lines"}));
+ add(Util.createTuple(new String[] {"do we"}));
+ add(Util.createTuple(new String[] {"handle that?,Good,Fairy"})); // Trailing double quote eats rest of line
+ }
+ };
+
+ String testStrTab =
+ "John\tDoe\t50\n" +
+ "\"Foo and CR last\n" +
+ "bar.\"\t\t\n" +
+ "Frank\tClean\t70";
+
+ String[] testStrTabArray =
+ new String[] {
+ "John\tDoe\t50",
+ "\"Foo and CR last\nbar.\"\t\t",
+ "Frank\tClean\t70"
+ };
+
+ @SuppressWarnings("serial")
+ ArrayList<Tuple> testStrTabYesMultilineResultTuples =
+ new ArrayList<Tuple>() {
+ {
+ add(Util.createTuple(new String[] {"John","Doe","50"}));
+ add(Util.createTuple(new String[] {"Foo and CR last\nbar.","",""}));
+ add(Util.createTuple(new String[] {"Frank","Clean","70"}));
+ }
+ };
+
+ private static final String dataDir = "build/test/tmpdata/";
+ private static final String testFile = "csv_excel_data";
+
+ private PigServer pig;
+
+ @Before
+ public void setup() throws IOException {
+ pig = new PigServer(ExecType.LOCAL);
+ pig.getPigContext().getProperties()
+ .setProperty("mapred.map.max.attempts", "1");
+ pig.getPigContext().getProperties()
+ .setProperty("mapred.reduce.max.attempts", "1");
+ pig.getPigContext().getProperties()
+ .setProperty("mapreduce.job.end-notification.retry.interval", "100");
+
+ Util.deleteDirectory(new File(dataDir));
+
+ pig.mkdirs(dataDir);
+
+ Util.createLocalInputFile(dataDir + testFile,
+ new String[] {
+ "int_field,long_field,float_field,double_field,chararray_field,bytearray_field",
+ "1,10,2.718,3.14159,qwerty,uiop",
+ "1,10,2.718,3.14159,,",
+ "1,10,,3.15159,,uiop",
+ "1,10,,3.15159,,uiop, moose",
+ "1,,\"2.718\",,\"qwerty\",\"uiop\"",
+ "1,,,,\"",
+ "qwe",
+ "rty\", uiop",
+ "1,,,,\"qwe,rty\",uiop",
+ "1,,,,\"q\"\"wert\"\"y\", uiop",
+ "1,,,,qwerty,\"u\"\"io\"\"p\""
+ });
+
+ Util.createLocalInputFile(testFileCommaName, testStrCommaArray);
+ Util.createLocalInputFile(testFileTabName, testStrTabArray);
+ }
+
+ @After
+ public void cleanup() throws IOException {
+ Util.deleteDirectory(new File(dataDir));
+ pig.shutdown();
+ }
+
+ // Load a simple CSV file with no escapes or special options, and verify both of its rows
+ @Test
+ public void testSimpleCsv() throws IOException {
+     String inputFileName = "TestCSVExcelStorage-simple.txt";
+     Util.createLocalInputFile(inputFileName, new String[] {"foo,bar,baz", "fee,foe,fum"});
+     String script = "a = load '" + inputFileName + "' using org.apache.pig.piggybank.storage.CSVExcelStorage() " + " as (a:chararray, b:chararray, c:chararray); ";
+     Util.registerMultiLineQuery(pig, script);
+     Iterator<Tuple> it = pig.openIterator("a");
+     Assert.assertEquals(Util.createTuple(new String[] {"foo", "bar", "baz"}), it.next());
+     Assert.assertEquals(Util.createTuple(new String[] {"fee", "foe", "fum"}), it.next());
+ }
+
+ // Load a field with commas in it (escaped with quotes)
+ @Test
+ public void testQuotedCommas() throws IOException {
+ String inputFileName = "TestCSVExcelStorage-quotedcommas.txt";
+ Util.createLocalInputFile(inputFileName, new String[] {"\"foo,bar,baz\"", "fee,foe,fum"});
+ String script = "a = load '" + inputFileName + "' using org.apache.pig.piggybank.storage.CSVExcelStorage() " +
+ " as (a:chararray, b:chararray, c:chararray); ";
+ Util.registerMultiLineQuery(pig, script);
+ Iterator<Tuple> it = pig.openIterator("a");
+ Assert.assertEquals(Util.createTuple(new String[] {"foo,bar,baz", null, null}), it.next());
+ Assert.assertEquals(Util.createTuple(new String[] {"fee", "foe", "fum"}), it.next());
+ }
+
+ // Two quotes characters should be interpreted as a single literal quotes character
+ @Test
+ public void testQuotedQuotes() throws IOException {
+ String inputFileName = "TestCSVExcelStorage-quotedquotes.txt";
+ Util.createLocalInputFile(inputFileName,
+ new String[] {"\"foo,\"\"bar\"\",baz\"", "\"\"\"\"\"\"\"\""});
+ String script = "a = load '" + inputFileName + "' using org.apache.pig.piggybank.storage.CSVExcelStorage() " +
+ " as (a:chararray); ";
+ Util.registerMultiLineQuery(pig, script);
+ Iterator<Tuple> it = pig.openIterator("a");
+ Assert.assertEquals(Util.createTuple(new String[] {"foo,\"bar\",baz"}), it.next());
+ Assert.assertEquals(Util.createTuple(new String[] {"\"\"\"\""}), it.next());
+ }
+
+ // Handle newlines in fields
+ @Test
+ public void testMultiline() throws IOException {
+ // Read the test file:
+ String script =
+ "a = LOAD '" + testFileCommaName + "' " +
+ "USING org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'YES_MULTILINE');";
+ pig.registerQuery(script);
+ compareExpectedActual(testStrCommaYesMultilineResultTuples, "a");
+
+ // Store the test file back down into another file using YES_MULTILINE:
+ String testOutFileName = createOutputFileName();
+ script = "STORE a INTO '" + testOutFileName + "' USING " +
+ "org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'YES_MULTILINE', 'UNIX');";
+ pig.registerQuery(script);
+
+ // Read it back out using YES_MULTILINE, and see whether it's still correct:
+ script = "b = LOAD '" + testOutFileName + "' " +
+ "USING org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'YES_MULTILINE');";
+ pig.registerQuery(script);
+ compareExpectedActual(testStrCommaYesMultilineResultTuples, "b");
+
+ // Now read it back again, but multilines turned off:
+ script = "c = LOAD '" + testOutFileName + "' " +
+ "USING org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'NO_MULTILINE');";
+ pig.registerQuery(script);
+ compareExpectedActual(testStrCommaNoMultilineResultTuples, "c");
+
+ // Store this re-read test file back down again, into another file using NO_MULTILINE:
+ testOutFileName = createOutputFileName();
+ script = "STORE c INTO '" + testOutFileName + "' USING " +
+ "org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'NO_MULTILINE', 'UNIX');";
+ pig.registerQuery(script);
+
+ // Read it back in, again with NO_MULTILINE and see whether it's still correct:
+ script = "d = LOAD '" + testOutFileName + "' " +
+ "USING org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'NO_MULTILINE');";
+ pig.registerQuery(script);
+ compareExpectedActual(testStrCommaNoMultilineResultTuples, "d");
+ }
+
+ // Handle non-comma delimiters
+ @Test
+ public void testTabDelimiter() throws IOException {
+ // Read the test file:
+ String script =
+ "e = LOAD '" + testFileTabName + "' " +
+ "USING org.apache.pig.piggybank.storage.CSVExcelStorage('\t', 'YES_MULTILINE');";
+ pig.registerQuery(script);
+ compareExpectedActual(testStrTabYesMultilineResultTuples, "e");
+
+ // Store the test file back down into another file using YES_MULTILINE:
+ String testOutFileName = createOutputFileName();
+ script = "STORE e INTO '" + testOutFileName + "' USING " +
+ "org.apache.pig.piggybank.storage.CSVExcelStorage('\t', 'YES_MULTILINE');";
+ pig.registerQuery(script);
+
+ // Read it back out using YES_MULTILINE, and see whether it's still correct:
+ script = "f = LOAD '" + testOutFileName + "' " +
+ "USING org.apache.pig.piggybank.storage.CSVExcelStorage('\t', 'YES_MULTILINE');";
+ pig.registerQuery(script);
+ compareExpectedActual(testStrTabYesMultilineResultTuples, "f");
+ }
+
+ // Asserts that the alias yields as many records as theExpected and that each expected field matches, in order.
+ private void compareExpectedActual(ArrayList<Tuple> theExpected, String theActualPigVarAlias) throws IOException {
+     Iterator<Tuple> actualIt = pig.openIterator(theActualPigVarAlias);
+     Iterator<Tuple> expIt = theExpected.iterator();
+     while (actualIt.hasNext()) {
+         Tuple actual = actualIt.next();
+         if (!expIt.hasNext()) {
+             Assert.fail("The input contains more records than expected. First unexpected record: " + actual);
+         }
+         Tuple expected = expIt.next();
+         // assertEquals(expected, actual) does not work here: per the casts below, expected fields
+         // are Strings while loaded fields are DataByteArrays, so compare element by element:
+         for (int i=0; i<expected.size(); i++) {
+             String truthEl = (String) expected.get(i);
+             String actualEl = new String(((DataByteArray) actual.get(i)).get());
+             Assert.assertEquals(truthEl, actualEl);
+         }
+     }
+     Assert.assertFalse("The input contains fewer records than expected.", expIt.hasNext());
+ }
+
+ /*
+ * Hack to get a temp file name to store data into.
+ * The file must not exist when the caller subsequently
+ * tries to write to it. In non-testing code this
+ * would be an intolerable race condition. There's
+ * likely a better way.
+ */
+ private String createOutputFileName() throws IOException {
+ File f = File.createTempFile("CSVExcelStorageTest", "csv");
+ f.deleteOnExit();
+ f.delete();
+ // On Windows this path will be C:\\..., which
+ // causes errors in the Hadoop environment. Replace
+ // the backslashes with forward slashes:
+ return f.getAbsolutePath().replaceAll("\\\\", "/");
+ }
+
+ // Comprehensive loader test: uses several datatypes; skips the header;
+ // handles missing/extra fields; handles quotes, commas, newlines
+ @Test
+ public void load() throws IOException, ParseException {
+ String schema = "i: int, l: long, f: float, d: double, c: chararray, b: bytearray";
+
+ pig.registerQuery(
+ "data = load '" + dataDir + testFile + "' " +
+ "using org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'YES_MULTILINE', 'UNIX', 'SKIP_INPUT_HEADER') " +
+ "AS (" + schema + ");"
+ );
+
+ Iterator<Tuple> data = pig.openIterator("data");
+ String[] expected = {
+ // the header row that setup() writes to the input file is skipped because the loader above is given 'SKIP_INPUT_HEADER'
+ "(1,10,2.718,3.14159,qwerty,uiop)", // basic data types
+ "(1,10,2.718,3.14159,,)", // missing fields at end
+ "(1,10,,3.15159,,uiop)", // missing field in the middle
+ "(1,10,,3.15159,,uiop)", // extra field (input data has "moose" after "uiop")
+ "(1,,2.718,,qwerty,uiop)", // quoted regular fields (2.718, qwerty, and uiop in quotes)
+ "(1,,,,\nqwe\nrty, uiop)", // newlines in quotes
+ "(1,,,,qwe,rty,uiop)", // commas in quotes
+ "(1,,,,q\"wert\"y, uiop)", // quotes in quotes
+ "(1,,,,qwerty,u\"io\"p)" // quotes in quotes at the end of a line
+ };
+
+ Assert.assertEquals(StringUtils.join(expected, "\n"), StringUtils.join(data, "\n"));
+ }
+
+ // Comprehensive storer test for non-container fields:
+ // uses several datatypes, writes a header, handle nulls, quotes, commas, newlines
+ @Test
+ public void storeScalarTypes() throws IOException, ParseException {
+ String input = testFile;
+ String schema = "int_field: int, long_field: long, float_field: float, double_field: double, " +
+ "chararray_field: chararray, bytearray_field: bytearray";
+ String output = "csv_excel_scalar_output";
+
+ // Store data
+
+ pig.registerQuery(
+ "data = load '" + dataDir + input + "' " +
+ "using org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'YES_MULTILINE', 'UNIX', 'SKIP_INPUT_HEADER') " +
+ "AS (" + schema + ");"
+ );
+ pig.store("data", dataDir + output,
+ "org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'YES_MULTILINE', 'UNIX', 'WRITE_OUTPUT_HEADER')");
+
+ // Read it back
+
+ pig.registerQuery(
+ "data = load '" + dataDir + output + "' " +
+ "using TextLoader() as (line: chararray);"
+ );
+
+ Iterator<Tuple> data = pig.openIterator("data");
+ String[] expected = {
+ // header should be written because we used the 'WRITE_OUTPUT_HEADER' argument
+ "(int_field,long_field,float_field,double_field,chararray_field,bytearray_field)",
+ "(1,10,2.718,3.14159,qwerty,uiop)",
+ "(1,10,2.718,3.14159,,)",
+ "(1,10,,3.15159,,uiop)",
+ "(1,10,,3.15159,,uiop)",
+ "(1,,2.718,,qwerty,uiop)",
+ "(1,,,,\")", // since we are just using TextLoader for verification
+ "(qwe)", // it treats the linebreaks as meaning separate records
+ "(rty\", uiop)", // but as shown in the load() test, CSVExcelStorage will read these properly
+ "(1,,,,\"qwe,rty\",uiop)",
+ "(1,,,,\"q\"\"wert\"\"y\", uiop)",
+ "(1,,,,qwerty,\"u\"\"io\"\"p\")"
+ };
+
+ Assert.assertEquals(StringUtils.join(expected, "\n"), StringUtils.join(data, "\n"));
+ }
+
+ // Test that tuples/bags/maps are stored as strings
+ @Test
+ public void storeComplexTypes() throws IOException, ParseException {
+ String input = "csv_excel_complex_input";
+ String schema = "a:(b:int,c:int),d:(e:int,f:(g:int,h:int)),i:{j:(k:int,l:int)},m:{n:(o:int,p:{q:(r:int,s:int)})},t:[int],u:[[int]]";
+ String output = "csv_excel_complex_output";
+
+ Util.createLocalInputFile(dataDir + input,
+ new String[] {
+ "(1,2)|(1,(2,3))|{(1,2),(3,4)}|{(1,{(2,3),(4,5)}),(6,{(7,8),(9,0)})}|[a#1,b#2]|[a#[b#1,c#2],d#[e#3,f#4]]",
+ "(1,)|(1,(2,))|{(1,),(3,)}|{(1,{(,3),(,5)}),(6,{(7,),(9,)})}|[a#,b#2]|[a#[b#,c#2],d#]"
+ });
+
+ pig.registerQuery(
+ "data = load '" + dataDir + input + "' " +
+ "using PigStorage('|')" +
+ "AS (" + schema + ");"
+ );
+ pig.store("data", dataDir + output,
+ "org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'YES_MULTILINE', 'UNIX', 'SKIP_OUTPUT_HEADER')");
+
+ pig.registerQuery(
+ "data = load '" + dataDir + output + "' " +
+ "using TextLoader() as (line: chararray);"
+ );
+
+ Iterator<Tuple> data = pig.openIterator("data");
+ String[] expected = {
+ "(\"(1,2)\",\"(1,(2,3))\",\"{(1,2),(3,4)}\",\"{(1,{(2,3),(4,5)}),(6,{(7,8),(9,0)})}\",\"{b=2, a=1}\",\"{d={f=4, e=3}, a={b=1, c=2}}\")",
+ "(\"(1,)\",\"(1,(2,))\",\"{(1,),(3,)}\",\"{(1,{(,3),(,5)}),(6,{(7,),(9,)})}\",\"{b=2, a=null}\",\"{d=null, a={b=null, c=2}}\")"
+ };
+
+ Assert.assertEquals(StringUtils.join(expected, "\n"), StringUtils.join(data, "\n"));
+ }
+}