You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by ga...@apache.org on 2008/10/09 19:31:59 UTC

svn commit: r703209 - in /incubator/pig/trunk: ./ contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/ contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/

Author: gates
Date: Thu Oct  9 10:31:58 2008
New Revision: 703209

URL: http://svn.apache.org/viewvc?rev=703209&view=rev
Log:
Pig-472 Added RegExLoader to piggybank, an abstract loader class to parse text files via regular expressions

Added:
    incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/RegExLoader.java
    incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestHelper.java
    incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestRegExLoader.java
Modified:
    incubator/pig/trunk/CHANGES.txt

Modified: incubator/pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/pig/trunk/CHANGES.txt?rev=703209&r1=703208&r2=703209&view=diff
==============================================================================
--- incubator/pig/trunk/CHANGES.txt (original)
+++ incubator/pig/trunk/CHANGES.txt Thu Oct  9 10:31:58 2008
@@ -354,3 +354,6 @@
 
     PIG-342: Fix DistinctDataBag to recalculate size after it has spilled. (bdimcheff via gates)
 
+	PIG-472: Added RegExLoader to piggybank, an abstract loader class to parse
+	text files via regular expressions (spackest via gates)
+

Added: incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/RegExLoader.java
URL: http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/RegExLoader.java?rev=703209&view=auto
==============================================================================
--- incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/RegExLoader.java (added)
+++ incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/RegExLoader.java Thu Oct  9 10:31:58 2008
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and limitations under the License.
+ */
+
+package org.apache.pig.piggybank.storage;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.pig.ReversibleLoadStoreFunc;
+import org.apache.pig.data.DataAtom;
+import org.apache.pig.data.Datum;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.impl.io.BufferedPositionedInputStream;
+
+/**
+ * RegExLoader is an abstract class used to parse logs based on a regular expression.
+ *
+ * There is a single abstract method, getPattern, which needs to return a Pattern. For every input line that
+ * matches, each capturing group is returned as a different DataAtom of the resulting Tuple. Lines that do
+ * not match the pattern are skipped.
+ *
+ * Look to org.apache.pig.piggybank.storage.apachelog.CommonLogLoader for example usage.
+ */
+
+public abstract class RegExLoader implements ReversibleLoadStoreFunc {
+    /** Stream the records are read from; stays null until the loading variant of bindTo is called. */
+    protected BufferedPositionedInputStream in = null;
+    /** Stream position past which this instance stops reading records. */
+    long end = Long.MAX_VALUE;
+    private byte recordDel = (byte) '\n';
+    private String fieldDel = "\t";
+    final private static Charset utf8 = Charset.forName("UTF8");
+    /** Stream that putNext writes to; set by bindTo(OutputStream). */
+    OutputStream os;
+
+    /**
+     * Supplies the regular expression applied to every input line.
+     *
+     * @return the pattern whose capturing groups become the fields of each tuple
+     */
+    abstract public Pattern getPattern();
+
+    public RegExLoader() {
+    }
+
+    /**
+     * Returns the next input line that matches the pattern, as a tuple holding one DataAtom per capturing
+     * group. Lines that do not match are skipped rather than reported as null, because callers treat a null
+     * return as "no more input" and would otherwise drop everything after the first malformed line.
+     *
+     * @return the next tuple, or null when no stream is bound, the position is past end, or no further
+     *         matching line can be read
+     * @throws IOException if reading from the underlying stream fails
+     */
+    public Tuple getNext() throws IOException {
+        if (in == null || in.getPosition() > end) {
+            return null;
+        }
+
+        // One matcher is reused for every line read during this call.
+        Matcher matcher = getPattern().matcher("");
+
+        String line;
+        while ((line = readRecord()) != null) {
+            matcher.reset(line);
+            if (matcher.find()) {
+                ArrayList<Datum> list = new ArrayList<Datum>(matcher.groupCount());
+                // Group 0 is the whole match; only the explicit capturing groups become fields.
+                for (int i = 1; i <= matcher.groupCount(); i++) {
+                    list.add(new DataAtom(matcher.group(i)));
+                }
+                return new Tuple(list);
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Reads a single raw record from the bound stream and strips one trailing carriage return, if present.
+     * The end position is checked before every read so that skipping records never runs past it.
+     *
+     * @return the record text, or null when no stream is bound, the position is past end, or the stream
+     *         returns no further line
+     * @throws IOException if reading from the underlying stream fails
+     */
+    private String readRecord() throws IOException {
+        if (in == null || in.getPosition() > end) {
+            return null;
+        }
+        String line = in.readLine(utf8, recordDel);
+        if (line != null && line.length() > 0 && line.charAt(line.length() - 1) == '\r') {
+            line = line.substring(0, line.length() - 1);
+        }
+        return line;
+    }
+
+    /**
+     * Binds this loader to an input stream and the range it is responsible for.
+     *
+     * @param fileName name of the file being read (not used by this class)
+     * @param in       stream to read records from
+     * @param offset   start offset of the range; a non-zero value causes the first record to be discarded
+     * @param end      stream position past which no further records are read
+     * @throws IOException if discarding the first record fails
+     */
+    public void bindTo(String fileName, BufferedPositionedInputStream in, long offset, long end) throws IOException {
+        this.in = in;
+        this.end = end;
+
+        // Since we are not block aligned we throw away the first
+        // record and count on a different instance to read it.
+        // The raw record is read directly instead of going through getNext(),
+        // which skips non-matching lines and would therefore also swallow the
+        // first complete record.
+        if (offset != 0) {
+            readRecord();
+        }
+    }
+
+    /**
+     * Binds this instance to the stream that putNext writes to.
+     *
+     * @param os destination for stored tuples
+     */
+    public void bindTo(OutputStream os) throws IOException {
+        this.os = os;
+    }
+
+    /**
+     * Writes a tuple as one record: its fields joined by the field delimiter, followed by the record
+     * delimiter, encoded with the same charset used for reading.
+     *
+     * @param f tuple to write
+     * @throws IOException if writing to the output stream fails
+     */
+    public void putNext(Tuple f) throws IOException {
+        os.write((f.toDelimitedString(this.fieldDel) + (char) this.recordDel).getBytes(utf8.name()));
+    }
+
+    public void finish() throws IOException {
+    }
+}

Added: incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestHelper.java
URL: http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestHelper.java?rev=703209&view=auto
==============================================================================
--- incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestHelper.java (added)
+++ incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestHelper.java Thu Oct  9 10:31:58 2008
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and limitations under the License.
+ */
+
+package org.apache.pig.piggybank.test.storage;
+
+import java.io.File;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import junit.framework.TestCase;
+
+import org.apache.pig.data.DataAtom;
+import org.apache.pig.data.Tuple;
+import org.junit.Test;
+
+public class TestHelper extends TestCase {
+    // Placeholder test. NOTE(review): presumably here so that test runners picking up this
+    // TestCase subclass do not fail it for containing no tests - confirm before removing.
+    @Test
+    public void testTest() {
+        assertTrue(true);
+    }
+
+
+    /**
+     * Computes the capture groups a loader is expected to produce for each row of test data.
+     * The pattern is applied to the first element of every row; the row must match.
+     *
+     * @param data    test rows; only element 0 of each row is examined
+     * @param pattern regular expression whose capturing groups are extracted
+     * @return one String[] per row, holding groups 1..groupCount of the first match in that row
+     */
+    public static ArrayList<String[]> getExpected(ArrayList<String[]> data, Pattern pattern) {
+        ArrayList<String[]> expected = new ArrayList<String[]>(data.size());
+        for (String[] row : data) {
+            String string = row[0];
+            Matcher matcher = pattern.matcher(string);
+            // Fail with a readable message instead of an IllegalStateException from group().
+            assertTrue("pattern " + pattern + " does not match: " + string, matcher.find());
+            String[] toAdd = new String[matcher.groupCount()];
+            for (int g = 0; g < toAdd.length; g++) {
+                toAdd[g] = matcher.group(g + 1);
+            }
+            expected.add(toAdd);
+        }
+
+        return expected;
+    }
+
+    /**
+     * Joins the given strings with the delimiter between consecutive elements.
+     *
+     * @return the joined text; the empty string for an empty array
+     */
+    private static String join(String delimiter, String[] strings) {
+        StringBuilder joined = new StringBuilder();
+        for (int i = 0; i < strings.length; i++) {
+            if (i > 0) {
+                joined.append(delimiter);
+            }
+            joined.append(strings[i]);
+        }
+        return joined.toString();
+    }
+
+    /**
+     * Asserts that every field of the tuple equals the corresponding expected string.
+     *
+     * @param expectedData expected field values, one String[] per tuple
+     * @param tuple        tuple under examination
+     * @param tupleCount   index into expectedData of the row this tuple is compared against
+     */
+    public static void examineTuple(ArrayList<String[]> expectedData, Tuple tuple, int tupleCount) {
+        for (int i = 0; i < tuple.arity(); i++) {
+            DataAtom dataAtom = tuple.getAtomField(i);
+            String expected = expectedData.get(tupleCount)[i];
+            String actual = dataAtom.toString();
+            assertEquals(expected, actual);
+        }
+    }
+
+    /**
+     * Writes the rows to a new temporary file, one line per row, with the elements of each row joined by
+     * the delimiter. The file is registered for deletion when the JVM exits.
+     *
+     * @param myData    rows to write
+     * @param delimiter text placed between the elements of a row
+     * @return absolute path of the temporary file
+     */
+    public static String createTempFile(ArrayList<String[]> myData, String delimiter) throws Exception {
+        File tmpFile = File.createTempFile("test", ".txt");
+        tmpFile.deleteOnExit();
+        // Written as UTF-8 rather than in the platform charset, matching what RegExLoader decodes.
+        PrintWriter pw = new PrintWriter(tmpFile, "UTF-8");
+        try {
+            for (String[] row : myData) {
+                pw.println(join(delimiter, row));
+            }
+        } finally {
+            pw.close();
+        }
+        return tmpFile.getAbsolutePath();
+    }
+}

Added: incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestRegExLoader.java
URL: http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestRegExLoader.java?rev=703209&view=auto
==============================================================================
--- incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestRegExLoader.java (added)
+++ incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestRegExLoader.java Thu Oct  9 10:31:58 2008
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and limitations under the License.
+ */
+
+package org.apache.pig.piggybank.test.storage;
+
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Properties;
+import java.util.regex.Pattern;
+
+import junit.framework.TestCase;
+
+import org.apache.pig.PigServer.ExecType;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.impl.PigContext;
+import org.apache.pig.impl.io.BufferedPositionedInputStream;
+import org.apache.pig.impl.io.FileLocalizer;
+import org.apache.pig.piggybank.storage.RegExLoader;
+import org.junit.Test;
+
+public class TestRegExLoader extends TestCase {
+    private static final String patternString = "(\\w+),(\\w+);(\\w+)";
+    private final static Pattern pattern = Pattern.compile(patternString);
+
+    /**
+     * Minimal concrete loader for the test. Declared static because it needs nothing from the enclosing
+     * test instance, and it hands back the precompiled pattern instead of compiling a new one per call.
+     */
+    static class DummyRegExLoader extends RegExLoader {
+        @Override
+        public Pattern getPattern() {
+            return pattern;
+        }
+    }
+
+    /** Input rows; each row holds a single line that the pattern splits into three groups. */
+    public static ArrayList<String[]> data = new ArrayList<String[]>();
+    static {
+        data.add(new String[] { "1,one;i" });
+        data.add(new String[] { "2,two;ii" });
+        data.add(new String[] { "3,three;iii" });
+    }
+
+    /**
+     * Writes the rows to a temporary file, binds a loader to it from offset 0, and checks that every row
+     * comes back as a tuple whose fields equal the pattern's capture groups.
+     */
+    @Test
+    public void testLoadFromBindTo() throws Exception {
+        String filename = TestHelper.createTempFile(data, " ");
+        DummyRegExLoader dummyRegExLoader = new DummyRegExLoader();
+        PigContext pigContext = new PigContext(ExecType.LOCAL, new Properties());
+        InputStream inputStream = FileLocalizer.open(filename, pigContext);
+        int tupleCount = 0;
+        try {
+            dummyRegExLoader.bindTo(filename, new BufferedPositionedInputStream(inputStream), 0, Long.MAX_VALUE);
+
+            ArrayList<String[]> expected = TestHelper.getExpected(data, pattern);
+
+            Tuple tuple;
+            while ((tuple = dummyRegExLoader.getNext()) != null) {
+                TestHelper.examineTuple(expected, tuple, tupleCount);
+                tupleCount++;
+            }
+        } finally {
+            // Release the file handle even when an assertion above fails.
+            inputStream.close();
+        }
+        assertEquals(data.size(), tupleCount);
+    }
+}