You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by ol...@apache.org on 2009/05/04 22:57:17 UTC
svn commit: r771438 - in /hadoop/pig/trunk: CHANGES.txt
src/org/apache/pig/builtin/RANDOM.java
src/org/apache/pig/impl/logicalLayer/parser/QueryParser.jjt
test/org/apache/pig/test/TestSample.java
Author: olga
Date: Mon May 4 20:57:17 2009
New Revision: 771438
URL: http://svn.apache.org/viewvc?rev=771438&view=rev
Log:
PIG-795: support for SAMPLE command
Added:
hadoop/pig/trunk/src/org/apache/pig/builtin/RANDOM.java
hadoop/pig/trunk/test/org/apache/pig/test/TestSample.java
Modified:
hadoop/pig/trunk/CHANGES.txt
hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/parser/QueryParser.jjt
Modified: hadoop/pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/CHANGES.txt?rev=771438&r1=771437&r2=771438&view=diff
==============================================================================
--- hadoop/pig/trunk/CHANGES.txt (original)
+++ hadoop/pig/trunk/CHANGES.txt Mon May 4 20:57:17 2009
@@ -610,3 +610,5 @@
PIG-652: Adapt changes in store interface to multi-query changes (hagleitn via gates).
PIG-627: multiquery support phase 3 (hagleitn and Richard Ding via olgan)
+
+ PIG-795: support for SAMPLE command (ericg via olgan)
Added: hadoop/pig/trunk/src/org/apache/pig/builtin/RANDOM.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/builtin/RANDOM.java?rev=771438&view=auto
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/builtin/RANDOM.java (added)
+++ hadoop/pig/trunk/src/org/apache/pig/builtin/RANDOM.java Mon May 4 20:57:17 2009
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pig.builtin;
+
+import java.io.IOException;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+import org.apache.pig.data.DataType;
+
+public class RANDOM extends EvalFunc<Double>{
+
+ public Double exec(Tuple input) throws IOException {
+ return Math.random();
+ }
+
+ @Override
+ public Schema outputSchema(Schema input) {
+ return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), DataType.DOUBLE));
+ }
+}
Modified: hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/parser/QueryParser.jjt
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/parser/QueryParser.jjt?rev=771438&r1=771437&r2=771438&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/parser/QueryParser.jjt (original)
+++ hadoop/pig/trunk/src/org/apache/pig/impl/logicalLayer/parser/QueryParser.jjt Mon May 4 20:57:17 2009
@@ -41,6 +41,7 @@
import org.apache.pig.ExecType;
import org.apache.pig.impl.io.*;
import org.apache.pig.builtin.PigStorage;
+import org.apache.pig.builtin.RANDOM;
import org.apache.pig.impl.builtin.GFAny;
import org.apache.pig.impl.logicalLayer.LogicalPlan;
import org.apache.commons.logging.Log;
@@ -842,6 +843,7 @@
TOKEN : { <STDIN: "stdin"> }
TOKEN : { <STDOUT: "stdout"> }
TOKEN : { <LIMIT: "limit"> }
+TOKEN : { <SAMPLE: "sample"> }
TOKEN:
{
@@ -1080,6 +1082,7 @@
| ((<GROUP> | <COGROUP>) op = CogroupClause(lp))
| (<FILTER> op = FilterClause(lp))
| (<LIMIT> op = LimitClause(lp))
+| (<SAMPLE> op = SampleClause(lp))
| (<ORDER> op = OrderClause(lp))
| (<DISTINCT> op = NestedExpr(lp)
{
@@ -1224,6 +1227,48 @@
}
}
+/*
+ * "SAMPLE a x" is translated to "FILTER a BY RANDOM()<x"
+ */
+LogicalOperator SampleClause(LogicalPlan lp) :
+{
+ ExpressionOperator cond;
+ LogicalOperator input;
+ Token t;
+ LogicalPlan conditionPlan = new LogicalPlan();
+ log.trace("Entering SampleClause");
+}
+{
+ (
+ input = NestedExpr(lp) {log.debug("Filter input: " + input);}
+ t = <DOUBLENUMBER>
+ )
+ {
+ LOUserFunc rand = new LOUserFunc(conditionPlan, new OperatorKey(scope, getNextId()), new FuncSpec(RANDOM.class.getName()), DataType.DOUBLE);
+ conditionPlan.add(rand);
+
+ double l = Double.parseDouble(t.image);
+ LOConst prob = new LOConst(conditionPlan, new OperatorKey(scope, getNextId()), l);
+ conditionPlan.add(prob);
+
+ cond = new LOLesserThanEqual(conditionPlan, new OperatorKey(scope, getNextId()));
+ conditionPlan.add(cond);
+ conditionPlan.connect(rand, cond);
+ conditionPlan.connect(prob, cond);
+
+ LogicalOperator filter = new LOFilter(lp, new OperatorKey(scope, getNextId()), conditionPlan);
+ addAlias(input.getAlias(), input);
+ lp.add(filter);
+ log.debug("Added operator " + filter.getClass().getName() + " to the logical plan");
+
+ lp.connect(input, filter);
+ log.debug("Connected alias " + input.getAlias() + " operator " + input.getClass().getName() + " to operator " + filter.getClass().getName() +" in the logical plan");
+
+ log.trace("Exiting SampleClause");
+ return filter;
+ }
+}
+
LogicalOperator LimitClause(LogicalPlan lp):
{
LogicalOperator input;
Added: hadoop/pig/trunk/test/org/apache/pig/test/TestSample.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/test/org/apache/pig/test/TestSample.java?rev=771438&view=auto
==============================================================================
--- hadoop/pig/trunk/test/org/apache/pig/test/TestSample.java (added)
+++ hadoop/pig/trunk/test/org/apache/pig/test/TestSample.java Mon May 4 20:57:17 2009
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.test;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.PrintStream;
+import java.text.DecimalFormat;
+import java.util.Iterator;
+
+import junit.framework.TestCase;
+
+import org.junit.Test;
+
+import org.apache.pig.ExecType;
+import org.apache.pig.PigServer;
+import org.apache.pig.data.DataType;
+import org.apache.pig.data.Tuple;
+
+public class TestSample
+extends TestCase
+{
+ private PigServer pig;
+ private File tmpFile;
+ private String tmpfilepath;
+
+ private int DATALEN = 1024;
+
+ @Override
+ protected void setUp()
+ throws Exception
+ {
+ //pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
+ pig = new PigServer("local");
+
+ tmpFile = File.createTempFile( this.getName(), ".txt");
+ PrintStream ps = new PrintStream(new FileOutputStream(tmpFile));
+ for(int i = 0; i < DATALEN; i++) {
+ ps.println(i);
+ }
+ ps.close();
+
+ tmpfilepath = Util.generateURI(tmpFile.getCanonicalPath());
+ }
+
+ protected void tearDown()
+ throws Exception
+ {
+ tmpFile.delete();
+ }
+
+ private void verify(String query, int expected_min, int expected_max)
+ throws Exception
+ {
+ System.out.println("[TestSample] Query: "+query);
+ pig.registerQuery(query);
+
+ int count = 0;
+ Iterator<Tuple> it = pig.openIterator("myid");
+ while (it.hasNext()) {
+ it.next();
+ count ++;
+ }
+
+ boolean closeEnough = ((expected_min<=count) && (count<=expected_max));
+ System.out.println("[TestSample] Result: "+expected_min+"<="+count+"<="+expected_max+" -> "+closeEnough);
+ assertTrue("Count outside expected range", closeEnough);
+ }
+
+ @Test
+ public void testSample_None()
+ throws Exception
+ {
+ verify("myid = sample (load '"+ tmpfilepath + "') 0.0;", 0, 0);
+ }
+
+ @Test
+ public void testSample_All()
+ throws Exception
+ {
+ verify("myid = sample (load '"+ tmpfilepath + "') 1.0;", DATALEN, DATALEN);
+ }
+
+ @Test
+ public void testSample_Some()
+ throws Exception
+ {
+ verify("myid = sample (load '"+ tmpfilepath + "') 0.5;", DATALEN/3, DATALEN*2/3);
+ }
+}