You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by ga...@apache.org on 2008/10/09 19:31:59 UTC
svn commit: r703209 - in /incubator/pig/trunk: ./
contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/
contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/
Author: gates
Date: Thu Oct 9 10:31:58 2008
New Revision: 703209
URL: http://svn.apache.org/viewvc?rev=703209&view=rev
Log:
Pig-472 Added RegExLoader to piggybank, an abstract loader class to parse text files via regular expressions
Added:
incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/RegExLoader.java
incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestHelper.java
incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestRegExLoader.java
Modified:
incubator/pig/trunk/CHANGES.txt
Modified: incubator/pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/pig/trunk/CHANGES.txt?rev=703209&r1=703208&r2=703209&view=diff
==============================================================================
--- incubator/pig/trunk/CHANGES.txt (original)
+++ incubator/pig/trunk/CHANGES.txt Thu Oct 9 10:31:58 2008
@@ -354,3 +354,6 @@
PIG-342: Fix DistinctDataBag to recalculate size after it has spilled. (bdimcheff via gates)
+ PIG-472: Added RegExLoader to piggybank, an abstract loader class to parse
+ text files via regular expressions (spackest via gates)
+
Added: incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/RegExLoader.java
URL: http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/RegExLoader.java?rev=703209&view=auto
==============================================================================
--- incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/RegExLoader.java (added)
+++ incubator/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/RegExLoader.java Thu Oct 9 10:31:58 2008
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and limitations under the License.
+ */
+
+package org.apache.pig.piggybank.storage;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.pig.ReversibleLoadStoreFunc;
+import org.apache.pig.data.DataAtom;
+import org.apache.pig.data.Datum;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.impl.io.BufferedPositionedInputStream;
+
+/**
+ * RegExLoader is an abstract class used to parse logs based on a regular expression.
+ *
+ * There is a single abstract method, getPattern which needs to return a Pattern. Each group will be returned
+ * as a different DataAtom.
+ *
+ * Look to org.apache.pig.piggybank.storage.apachelog.CommonLogLoader for example usage.
+ */
+
+public abstract class RegExLoader implements ReversibleLoadStoreFunc {
+ protected BufferedPositionedInputStream in = null;
+ long end = Long.MAX_VALUE;
+ private byte recordDel = (byte) '\n';
+ private String fieldDel = "\t";
+ final private static Charset utf8 = Charset.forName("UTF8");
+ OutputStream os;
+
+ abstract public Pattern getPattern();
+
+ public RegExLoader() {
+ }
+
+ public Tuple getNext() throws IOException {
+ if (in == null || in.getPosition() > end) {
+ return null;
+ }
+
+ Pattern pattern = getPattern();
+ Matcher matcher = pattern.matcher("");
+
+ String line;
+ if ((line = in.readLine(utf8, recordDel)) != null) {
+ if (line.length() > 0 && line.charAt(line.length() - 1) == '\r')
+ line = line.substring(0, line.length() - 1);
+
+ matcher.reset(line);
+ if (matcher.find()) {
+ ArrayList<Datum> list = new ArrayList<Datum>();
+
+ for (int i = 1; i <= matcher.groupCount(); i++) {
+ list.add(new DataAtom(matcher.group(i)));
+ }
+ return new Tuple(list);
+ }
+ }
+ return null;
+ }
+
+ public void bindTo(String fileName, BufferedPositionedInputStream in, long offset, long end) throws IOException {
+ this.in = in;
+ this.end = end;
+
+ // Since we are not block aligned we throw away the first
+ // record and could on a different instance to read it
+ if (offset != 0) {
+ getNext();
+ }
+ }
+
+ public void bindTo(OutputStream os) throws IOException {
+ this.os = os;
+ }
+
+ public void putNext(Tuple f) throws IOException {
+ os.write((f.toDelimitedString(this.fieldDel) + (char) this.recordDel).getBytes("utf8"));
+ }
+
+ public void finish() throws IOException {
+ }
+}
Added: incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestHelper.java
URL: http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestHelper.java?rev=703209&view=auto
==============================================================================
--- incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestHelper.java (added)
+++ incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestHelper.java Thu Oct 9 10:31:58 2008
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and limitations under the License.
+ */
+
+package org.apache.pig.piggybank.test.storage;
+
+import java.io.File;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import junit.framework.TestCase;
+
+import org.apache.pig.data.DataAtom;
+import org.apache.pig.data.Tuple;
+import org.junit.Test;
+
+/**
+ * Static helpers shared by the piggybank storage tests: building expected results from a
+ * regular expression, comparing tuples against them, and writing test data to a temporary file.
+ */
+public class TestHelper extends TestCase {
+    /**
+     * Trivial always-passing test.
+     * NOTE(review): presumably present so that test runners do not report this TestCase
+     * subclass as containing no tests - confirm before removing.
+     */
+    @Test
+    public void testTest() {
+        assertTrue(true);
+    }
+
+
+    /**
+     * Computes the fields a regular-expression based loader is expected to produce for each data row.
+     *
+     * Only the first element of every row is searched. The returned row has one entry per capturing
+     * group of the pattern, in group order.
+     *
+     * @param data rows of test data; the first element of each row is the text to search
+     * @param pattern pattern whose capturing groups define the expected fields
+     * @return one String array of captured groups per data row, in the same order as data
+     * @throws IllegalStateException if the pattern is not found in a row
+     */
+    public static ArrayList<String[]> getExpected(ArrayList<String[]> data, Pattern pattern) {
+        ArrayList<String[]> expected = new ArrayList<String[]>(data.size());
+        for (String[] row : data) {
+            String text = row[0];
+            Matcher matcher = pattern.matcher(text);
+            if (!matcher.find()) {
+                // Same exception type Matcher.group raises without a match, but with context.
+                throw new IllegalStateException("Pattern '" + pattern.pattern() + "' not found in '" + text + "'");
+            }
+            String[] groups = new String[matcher.groupCount()];
+            for (int g = 0; g < groups.length; g++) {
+                groups[g] = matcher.group(g + 1);
+            }
+            expected.add(groups);
+        }
+
+        return expected;
+    }
+
+    /**
+     * Concatenates the given strings, placing the delimiter between consecutive elements.
+     *
+     * @param delimiter text inserted between elements
+     * @param strings elements to join; an empty array yields an empty string
+     * @return the joined text
+     */
+    private static String join(String delimiter, String[] strings) {
+        StringBuilder joined = new StringBuilder();
+        for (int i = 0; i < strings.length; i++) {
+            if (i > 0) {
+                joined.append(delimiter);
+            }
+            joined.append(strings[i]);
+        }
+        return joined.toString();
+    }
+
+    /**
+     * Asserts that a tuple has exactly the fields expected for the row at the given index.
+     *
+     * @param expectedData expected field values, one String array per tuple
+     * @param tuple tuple produced by the loader under test
+     * @param tupleCount zero-based index of the tuple, used to select the expected row
+     */
+    public static void examineTuple(ArrayList<String[]> expectedData, Tuple tuple, int tupleCount) {
+        String[] expectedRow = expectedData.get(tupleCount);
+        // Without this check a tuple with too few fields would pass unnoticed.
+        assertEquals("field count of tuple " + tupleCount, expectedRow.length, tuple.arity());
+        for (int i = 0; i < expectedRow.length; i++) {
+            DataAtom dataAtom = tuple.getAtomField(i);
+            String actual = dataAtom.toString();
+            assertEquals("tuple " + tupleCount + ", field " + i, expectedRow[i], actual);
+        }
+    }
+
+    /**
+     * Writes the data to a new temporary file, one row per line, and returns the file's path.
+     *
+     * The file is written as UTF-8 and is deleted when the JVM exits.
+     *
+     * @param myData rows to write
+     * @param delimiter text placed between the elements of a row
+     * @return absolute path of the temporary file
+     * @throws Exception if the file cannot be created or opened for writing
+     */
+    public static String createTempFile(ArrayList<String[]> myData, String delimiter) throws Exception {
+        File tmpFile = File.createTempFile("test", ".txt");
+        tmpFile.deleteOnExit();
+
+        // An explicit charset keeps the file independent of the platform default encoding.
+        PrintWriter pw = new PrintWriter(tmpFile, "UTF-8");
+        try {
+            for (String[] row : myData) {
+                pw.println(join(delimiter, row));
+            }
+        } finally {
+            pw.close();
+        }
+        return tmpFile.getAbsolutePath();
+    }
+}
Added: incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestRegExLoader.java
URL: http://svn.apache.org/viewvc/incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestRegExLoader.java?rev=703209&view=auto
==============================================================================
--- incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestRegExLoader.java (added)
+++ incubator/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestRegExLoader.java Thu Oct 9 10:31:58 2008
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and limitations under the License.
+ */
+
+package org.apache.pig.piggybank.test.storage;
+
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Properties;
+import java.util.regex.Pattern;
+
+import junit.framework.TestCase;
+
+import org.apache.pig.PigServer.ExecType;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.impl.PigContext;
+import org.apache.pig.impl.io.BufferedPositionedInputStream;
+import org.apache.pig.impl.io.FileLocalizer;
+import org.apache.pig.piggybank.storage.RegExLoader;
+import org.junit.Test;
+
+/**
+ * Tests RegExLoader through a minimal concrete subclass that splits lines of the form
+ * "word,word;word" into three fields.
+ */
+public class TestRegExLoader extends TestCase {
+    private static final String patternString = "(\\w+),(\\w+);(\\w+)";
+    private final static Pattern pattern = Pattern.compile(patternString);
+
+    /**
+     * Concrete loader used by the test. It is a static nested class because it needs no reference
+     * to the enclosing test instance, and it returns the shared precompiled pattern so the regular
+     * expression is not recompiled on every call.
+     */
+    static class DummyRegExLoader extends RegExLoader {
+        @Override
+        public Pattern getPattern() {
+            return pattern;
+        }
+    }
+
+    /** Input rows; each row holds a single line of text that matches the pattern. */
+    public static ArrayList<String[]> data = new ArrayList<String[]>();
+    static {
+        data.add(new String[] { "1,one;i" });
+        data.add(new String[] { "2,two;ii" });
+        data.add(new String[] { "3,three;iii" });
+    }
+
+    /**
+     * Writes the data to a temporary file, binds a loader to it from offset 0 and checks that
+     * every line comes back as a tuple holding the three captured groups.
+     *
+     * @throws Exception if the temporary file cannot be written or read
+     */
+    @Test
+    public void testLoadFromBindTo() throws Exception {
+        String filename = TestHelper.createTempFile(data, " ");
+        DummyRegExLoader dummyRegExLoader = new DummyRegExLoader();
+        PigContext pigContext = new PigContext(ExecType.LOCAL, new Properties());
+        InputStream inputStream = FileLocalizer.open(filename, pigContext);
+        try {
+            dummyRegExLoader.bindTo(filename, new BufferedPositionedInputStream(inputStream), 0, Long.MAX_VALUE);
+
+            ArrayList<String[]> expected = TestHelper.getExpected(data, pattern);
+            int tupleCount = 0;
+
+            Tuple tuple;
+            while ((tuple = dummyRegExLoader.getNext()) != null) {
+                TestHelper.examineTuple(expected, tuple, tupleCount);
+                tupleCount++;
+            }
+            assertEquals(data.size(), tupleCount);
+        } finally {
+            // Release the file handle even when an assertion fails.
+            inputStream.close();
+        }
+    }
+}