You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by da...@apache.org on 2010/06/23 05:37:13 UTC
svn commit: r957100 [2/2] - in /hadoop/pig/trunk: ./ src/org/apache/pig/builtin/ test/org/apache/pig/test/

Added: hadoop/pig/trunk/src/org/apache/pig/builtin/REGEX_EXTRACT_ALL.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/builtin/REGEX_EXTRACT_ALL.java?rev=957100&view=auto
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/builtin/REGEX_EXTRACT_ALL.java (added)
+++ hadoop/pig/trunk/src/org/apache/pig/builtin/REGEX_EXTRACT_ALL.java Wed Jun 23 03:37:11 2010
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.builtin;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.FuncSpec;
+import org.apache.pig.data.DataType;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+
+/**
+ * <dl>
+ * <dt><b>Syntax:</b></dt>
+ * <dd><code>String RegexExtractAll(String expression, String regex)</code>.</dd>
+ * <dt><b>Input:</b></dt>
+ * <dd><code>expression</code>-<code>source string</code>.</dd>
+ * <dd><code>regex</code>-<code>regular expression</code>.</dd>
+ * <dt><b>Output:</b></dt>
+ * <dd><code>A tuple of matched strings</code>.</dd>
+ * </dl>
+ */
+
+public class REGEX_EXTRACT_ALL extends EvalFunc<Tuple> {
+
+    private static TupleFactory tupleFactory = TupleFactory.getInstance();
+
+    @Override
+    public Tuple exec(Tuple input) throws IOException {
+        if (input.size()!=2) {
+            String msg = "RegexExtractAll : Only 2 parameters are allowed.";
+            throw new IOException(msg);
+        }
+
+        if (input.get(0)==null)
+            return null;
+        try {
+            if (!input.get(1).equals(mExpression)) {
+                try {
+                    mExpression = (String)input.get(1);
+                    mPattern = Pattern.compile(mExpression);
+                } catch (Exception e) {
+                    String msg = "RegexExtractAll : Mal-Formed Regular expression : "+input.get(1);
+                    throw new IOException(msg);
+                }
+            }
+        } catch (NullPointerException e) {
+            String msg = "RegexExtractAll : Regular expression is null";
+            throw new IOException(msg);
+        }
+
+        Matcher m = mPattern.matcher((String)input.get(0));
+        if (!m.matches()) {
+            return null;
+        }
+        Tuple result = tupleFactory.newTuple(m.groupCount());
+        for (int i = 0; i< m.groupCount(); i++) {
+            result.set(i, m.group(i+1));
+        }
+        return result;
+    }
+
+    String mExpression = null;
+    Pattern mPattern = null; 
+    @Override
+    public Schema outputSchema(Schema input) {
+        try {
+            return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), 
+                    DataType.TUPLE));
+        } catch (Exception e) {
+            return null;
+        }
+    }
+
+    @Override
+    public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
+        List<FuncSpec> funcList = new ArrayList<FuncSpec>();
+        Schema s = new Schema();
+        s.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+        s.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+        funcList.add(new FuncSpec(this.getClass().getName(), s));
+        return funcList;
+    } 
+}
+

Added: hadoop/pig/trunk/src/org/apache/pig/builtin/REPLACE.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/builtin/REPLACE.java?rev=957100&view=auto
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/builtin/REPLACE.java (added)
+++ hadoop/pig/trunk/src/org/apache/pig/builtin/REPLACE.java Wed Jun 23 03:37:11 2010
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pig.builtin;
+
+import java.io.IOException;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.DataType;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+
+
+/**
+ * REPLACE implements eval function to replace part of a string.
+ * Example:<code>
+ *      A = load 'mydata' as (name);
+ *      B = foreach A generate REPLACE(name, 'blabla', 'bla');
+ *      </code>
+ * The first argument is a string on which to perform the operation. The second argument
+ * is treated as a regular expression. The third argument is the replacement string.
+ * This is a wrapper around Java's String.replaceAll(String, String);
+ */
+public class REPLACE extends EvalFunc<String>
+{
+    /**
+     * Method invoked on every tuple during foreach evaluation.
+     * @param input tuple of at least three fields: the source string, the
+     * regular expression to look for, and the replacement string
+     * @return the source string with every match replaced; null when the tuple
+     * is null, has fewer than three fields, or processing raises an exception
+     * (a warning is logged in the last case)
+     * @exception java.io.IOException
+     */
+    @Override
+    public String exec(Tuple input) throws IOException {
+        final boolean hasAllArguments = input != null && input.size() >= 3;
+        if (!hasAllArguments) {
+            return null;
+        }
+
+        String replaced;
+        try {
+            final String subject = (String) input.get(0);
+            final String regex = (String) input.get(1);
+            final String replacement = (String) input.get(2);
+            replaced = subject.replaceAll(regex, replacement);
+        } catch (Exception e) {
+            // Best effort: a bad row is reported and mapped to null.
+            log.warn("Failed to process input; error - " + e.getMessage());
+            replaced = null;
+        }
+        return replaced;
+    }
+
+    /**
+     * Declares the output as a single unnamed chararray field.
+     */
+    @Override
+    public Schema outputSchema(Schema input) {
+        final Schema.FieldSchema resultField = new Schema.FieldSchema(null, DataType.CHARARRAY);
+        return new Schema(resultField);
+    }
+
+}
\ No newline at end of file

Added: hadoop/pig/trunk/src/org/apache/pig/builtin/ROUND.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/builtin/ROUND.java?rev=957100&view=auto
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/builtin/ROUND.java (added)
+++ hadoop/pig/trunk/src/org/apache/pig/builtin/ROUND.java Wed Jun 23 03:37:11 2010
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pig.builtin;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.ArrayList;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.FuncSpec;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+import org.apache.pig.data.DataType;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+
+/**
+ * ROUND implements a binding to the Java function
+ * {@link java.lang.Math#round(double) Math.round(double)}.
+ * Given a single data atom it returns the closest long to the argument.
+ */
+public class ROUND extends EvalFunc<Long>{
+    /**
+     * java level API
+     * @param input expects a single numeric value
+     * @return the closest long to the argument; null when the tuple is null or
+     * empty, when the value converts to null, or when it cannot be parsed as a
+     * number (a warning is logged in the last case)
+     * @throws IOException if any other exception occurs while processing the row
+     */
+    @Override
+    public Long exec(Tuple input) throws IOException {
+        if (input == null || input.size() == 0)
+            return null;
+
+        try{
+            Double d =  DataType.toDouble(input.get(0));
+            // Guard before Math.round(double): unboxing a null Double would
+            // throw a NullPointerException and fail the whole row.
+            if (d == null) {
+                return null;
+            }
+            return Math.round(d);
+        } catch (NumberFormatException nfe){
+            // Report through the function's logger, as the other builtins do.
+            log.warn("Failed to process input; error - " + nfe.getMessage());
+            return null;
+        } catch (Exception e){
+            throw new IOException("Caught exception processing input row ", e);
+        }
+    }
+
+    /**
+     * Declares the output as a single long field named after this class.
+     */
+    @Override
+    public Schema outputSchema(Schema input) {
+        return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), DataType.LONG));
+    }
+
+    /* (non-Javadoc)
+     * @see org.apache.pig.EvalFunc#getArgToFuncMapping()
+     *
+     * bytearray arguments are handled by this class, double arguments by
+     * DoubleRound and float arguments by FloatRound.
+     */
+    @Override
+    public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
+        List<FuncSpec> funcList = new ArrayList<FuncSpec>();
+        funcList.add(new FuncSpec(this.getClass().getName(), new Schema(new Schema.FieldSchema(null, DataType.BYTEARRAY))));
+        funcList.add(new FuncSpec(DoubleRound.class.getName(),  new Schema(new Schema.FieldSchema(null, DataType.DOUBLE))));
+        funcList.add(new FuncSpec(FloatRound.class.getName(),   new Schema(new Schema.FieldSchema(null, DataType.FLOAT))));
+        return funcList;
+    }
+}

Added: hadoop/pig/trunk/src/org/apache/pig/builtin/SIN.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/builtin/SIN.java?rev=957100&view=auto
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/builtin/SIN.java (added)
+++ hadoop/pig/trunk/src/org/apache/pig/builtin/SIN.java Wed Jun 23 03:37:11 2010
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pig.builtin;
+
+/**
+ * SIN implements a binding to the Java function
+ * {@link java.lang.Math#sin(double) Math.sin(double)}.
+ * Given a single data atom it returns the sine of the argument.
+ */
+public class SIN extends DoubleBase {
+    /**
+     * Computes the sine of the given value.
+     * @param input the value handed to Math.sin; it is unboxed, so it must not be null
+     * @return the sine of the input
+     */
+    Double compute(Double input) {
+        final double angle = input.doubleValue();
+        return Double.valueOf(Math.sin(angle));
+    }
+}

Added: hadoop/pig/trunk/src/org/apache/pig/builtin/SINH.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/builtin/SINH.java?rev=957100&view=auto
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/builtin/SINH.java (added)
+++ hadoop/pig/trunk/src/org/apache/pig/builtin/SINH.java Wed Jun 23 03:37:11 2010
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pig.builtin;
+
+/**
+ * SINH implements a binding to the Java function
+ * {@link java.lang.Math#sinh(double) Math.sinh(double)}.
+ * Given a single data atom it returns the hyperbolic sine of the argument.
+ */
+public class SINH extends DoubleBase {
+    /**
+     * Computes the hyperbolic sine of the given value.
+     * @param input the value handed to Math.sinh; it is unboxed, so it must not be null
+     * @return the hyperbolic sine of the input
+     */
+    Double compute(Double input) {
+        final double value = input.doubleValue();
+        return Double.valueOf(Math.sinh(value));
+    }
+}

Added: hadoop/pig/trunk/src/org/apache/pig/builtin/SPLIT.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/builtin/SPLIT.java?rev=957100&view=auto
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/builtin/SPLIT.java (added)
+++ hadoop/pig/trunk/src/org/apache/pig/builtin/SPLIT.java Wed Jun 23 03:37:11 2010
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.builtin;
+
+import java.io.IOException;
+
+import java.util.Arrays;
+import java.util.regex.PatternSyntaxException;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+
+/**
+ * Wrapper around Java's String.split<br>
+ * input tuple: first column is assumed to have a string to split;<br>
+ * the optional second column is assumed to have the delimiter or regex to split on;<br>
+ * if not provided, it's assumed to be '\s' (any single whitespace character)<br>
+ * the optional third column may provide a limit to the number of results.<br>
+ * If limit is not provided, 0 is assumed, as per Java's split().
+ */
+
+public class SPLIT extends EvalFunc<Tuple> {
+
+    private final static TupleFactory tupleFactory = TupleFactory.getInstance();
+
+    /**
+     * Wrapper around Java's String.split
+     * @param input tuple; first column is assumed to have a string to split;
+     * the optional second column is assumed to have the delimiter or regex to split on;<br>
+     * if not provided, it's assumed to be '\s' (any single whitespace character)
+     * the optional third column may provide a limit to the number of results.<br>
+     * If limit is not provided, 0 is assumed, as per Java's split().
+     * @return a tuple with one field per piece; null when the tuple is null or
+     * empty, when the string, delimiter or limit is null, or when an argument
+     * has the wrong type or the delimiter is not a valid regex (a warning is
+     * logged in the last two cases)
+     * @exception java.io.IOException
+     */
+    @Override
+    public Tuple exec(Tuple input) throws IOException {
+        if (input == null || input.size() < 1)
+            return null;
+        try {
+            String source = (String) input.get(0);
+            String delim = (input.size() > 1 ) ? (String) input.get(1) : "\\s";
+            // Keep the limit boxed: unboxing a null third column straight into
+            // an int would throw a NullPointerException that nothing catches.
+            Integer limit = Integer.valueOf(0);
+            if (input.size() > 2) {
+                limit = (Integer) input.get(2);
+            }
+            if (source == null || delim == null || limit == null) {
+                return null;
+            }
+            String[] splits = source.split(delim, limit.intValue());
+            return tupleFactory.newTuple(Arrays.asList(splits));
+        } catch (ClassCastException e) {
+            log.warn("class cast exception at "+e.getStackTrace()[0]);
+        } catch (PatternSyntaxException e) {
+            log.warn(e.getMessage());
+        }
+        // this only happens if the try block did not complete normally
+        return null;
+    }
+}

Added: hadoop/pig/trunk/src/org/apache/pig/builtin/SQRT.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/builtin/SQRT.java?rev=957100&view=auto
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/builtin/SQRT.java (added)
+++ hadoop/pig/trunk/src/org/apache/pig/builtin/SQRT.java Wed Jun 23 03:37:11 2010
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pig.builtin;
+/**
+ * SQRT implements a binding to the Java function
+ * {@link java.lang.Math#sqrt(double) Math.sqrt(double)}.
+ * Given a single data atom it returns the square root of the argument.
+ */
+public class SQRT extends DoubleBase {
+    /**
+     * Computes the square root of the given value.
+     * @param input the value handed to Math.sqrt; it is unboxed, so it must not be null
+     * @return the square root of the input
+     */
+    Double compute(Double input) {
+        final double value = input.doubleValue();
+        return Double.valueOf(Math.sqrt(value));
+    }
+}

Added: hadoop/pig/trunk/src/org/apache/pig/builtin/SUBSTRING.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/builtin/SUBSTRING.java?rev=957100&view=auto
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/builtin/SUBSTRING.java (added)
+++ hadoop/pig/trunk/src/org/apache/pig/builtin/SUBSTRING.java Wed Jun 23 03:37:11 2010
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pig.builtin;
+
+import java.io.IOException;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.DataType;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+
+/**
+ * SUBSTRING implements eval function to get a part of a string.
+ * Example:<code>
+ *      A = load 'mydata' as (name);
+ *      B = foreach A generate SUBSTRING(name, 10, 12);
+ *      </code>
+ * First argument is the string to take a substring of.<br>
+ * Second argument is the index of the first character of substring.<br>
+ * Third argument is the index of the last character of substring.<br>
+ * if the last argument is past the end of the string, substring of (beginIndex, length(str)) is returned.
+ */
+public class SUBSTRING extends EvalFunc<String> {
+
+    /**
+     * Method invoked on every tuple during foreach evaluation
+     * @param input tuple; first column is assumed to have the column to convert
+     * @exception java.io.IOException
+     */
+    public String exec(Tuple input) throws IOException {
+        if (input == null || input.size() < 3) {
+            log.warn("invalid number of arguments to SUBSTRING");
+            return null;
+        }
+        try {
+            String source = (String)input.get(0);
+            Integer beginindex = (Integer)input.get(1);
+            Integer endindex = (Integer)input.get(2);
+            return source.substring(beginindex, Math.min(source.length(), endindex));
+        } catch (NullPointerException npe) {
+            log.warn(npe.toString());
+            return null;
+        } catch (ClassCastException e) {
+            log.warn(e.toString());
+            return null;
+        }
+    }
+
+    @Override
+    public Schema outputSchema(Schema input) {
+        return new Schema(new Schema.FieldSchema(null, DataType.CHARARRAY));
+    }
+
+}
\ No newline at end of file

Added: hadoop/pig/trunk/src/org/apache/pig/builtin/TAN.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/builtin/TAN.java?rev=957100&view=auto
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/builtin/TAN.java (added)
+++ hadoop/pig/trunk/src/org/apache/pig/builtin/TAN.java Wed Jun 23 03:37:11 2010
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pig.builtin;
+
+/**
+ * TAN implements a binding to the Java function
+ * {@link java.lang.Math#tan(double) Math.tan(double)}.
+ * Given a single data atom it returns the tangent of the argument.
+ */
+public class TAN extends DoubleBase {
+    /**
+     * Computes the tangent of the given value.
+     * @param input the value handed to Math.tan; it is unboxed, so it must not be null
+     * @return the tangent of the input
+     */
+    Double compute(Double input) {
+        final double angle = input.doubleValue();
+        return Double.valueOf(Math.tan(angle));
+    }
+}

Added: hadoop/pig/trunk/src/org/apache/pig/builtin/TANH.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/builtin/TANH.java?rev=957100&view=auto
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/builtin/TANH.java (added)
+++ hadoop/pig/trunk/src/org/apache/pig/builtin/TANH.java Wed Jun 23 03:37:11 2010
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pig.builtin;
+
+/**
+ * TANH implements a binding to the Java function
+ * {@link java.lang.Math#tanh(double) Math.tanh(double)}.
+ * Given a single data atom it returns the hyperbolic tangent
+ * of the argument.
+ */
+public class TANH extends DoubleBase {
+    /**
+     * Computes the hyperbolic tangent of the given value.
+     * @param input the value handed to Math.tanh; it is unboxed, so it must not be null
+     * @return the hyperbolic tangent of the input
+     */
+    Double compute(Double input) {
+        final double value = input.doubleValue();
+        return Double.valueOf(Math.tanh(value));
+    }
+}

Added: hadoop/pig/trunk/src/org/apache/pig/builtin/TOBAG.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/builtin/TOBAG.java?rev=957100&view=auto
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/builtin/TOBAG.java (added)
+++ hadoop/pig/trunk/src/org/apache/pig/builtin/TOBAG.java Wed Jun 23 03:37:11 2010
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.builtin;
+
+
+import java.io.IOException;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.BagFactory;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+
+/**
+ * This class takes a list of items and puts them into a bag
+ * T = foreach U generate TOBAG($0, $1, $2);
+ * It's like saying this:
+ * T = foreach U generate {($0), ($1), ($2)}
+ * Items that are null are skipped, so the bag may hold fewer tuples than
+ * there were arguments.
+ */
+public class TOBAG extends EvalFunc<DataBag> {
+
+    /**
+     * Wraps every non-null field of the input in its own one-field tuple and
+     * collects those tuples in a new bag.
+     * @param input tuple whose fields become the bag's elements
+     * @return a bag with one single-field tuple per non-null input field
+     * @throws RuntimeException wrapping any exception raised while building the bag
+     */
+    @Override
+    public DataBag exec(Tuple input) throws IOException {
+        try {
+            DataBag bag = BagFactory.getInstance().newDefaultBag();
+            // Neither the factory nor the field count changes between
+            // iterations, so look both up once instead of per field.
+            final TupleFactory tupleFactory = TupleFactory.getInstance();
+            final int fieldCount = input.size();
+
+            for (int i = 0; i < fieldCount; ++i) {
+                final Object object = input.get(i);
+                if (object != null) {
+                    Tuple tp2 = tupleFactory.newTuple(1);
+                    tp2.set(0, object);
+                    bag.add(tp2);
+                }
+            }
+
+            return bag;
+        } catch (Exception ee) {
+            throw new RuntimeException("Error while creating a bag", ee);
+        }
+    }
+}

Added: hadoop/pig/trunk/src/org/apache/pig/builtin/TOP.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/builtin/TOP.java?rev=957100&view=auto
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/builtin/TOP.java (added)
+++ hadoop/pig/trunk/src/org/apache/pig/builtin/TOP.java Wed Jun 23 03:37:11 2010
@@ -0,0 +1,349 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.builtin;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+import java.util.PriorityQueue;
+import java.util.Random;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.pig.Algebraic;
+import org.apache.pig.EvalFunc;
+import org.apache.pig.FuncSpec;
+import org.apache.pig.backend.executionengine.ExecException;
+import org.apache.pig.data.BagFactory;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.DataType;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
+
+/**
+ * Top UDF accepts a bag of tuples and returns the top-n tuples ordered by the
+ * value of a given tuple field. Both n and the field number need to be provided
+ * to the UDF. The UDF iterates through the input bag and retains only the top-n
+ * tuples by storing them in a priority queue of size n+1 where the priority is
+ * that field. This is efficient as a priority queue provides constant time - O(1)
+ * access to the least element and O(log n) time for insertion and removal. The
+ * UDF is especially helpful for turning the nested grouping operation inside
+ * out and retaining top-n in a nested group. 
+ * 
+ * Assumes all tuples in the bag contain an element of the same type in the compared column.
+ * 
+ * Sample usage: 
+ * A = LOAD 'test.tsv' as (first: chararray, second: chararray); 
+ * B = GROUP A BY (first, second);
+ * C = FOREACH B generate FLATTEN(group), COUNT(*) as count;
+ * D = GROUP C BY first; // again group by first 
+ * topResults = FOREACH D { 
+ *          result = TOP(10, 2, C); // and retain top 10 occurrences of 'second' in first 
+ *          GENERATE FLATTEN(result); 
+ *  }
+ */
+public class TOP extends EvalFunc<DataBag> implements Algebraic{
+    private static final Log log = LogFactory.getLog(TOP.class);
+    static BagFactory mBagFactory = BagFactory.getInstance();
+    static TupleFactory mTupleFactory = TupleFactory.getInstance();
+    private Random randomizer = new Random();
+
+    /**
+     * Orders tuples by the value of a single field.
+     *
+     * Null tuples, and tuples whose compared field is null, sort before
+     * everything else. The Pig data type of the column is looked up from the
+     * first non-null value seen and reused for later comparisons, which relies
+     * on every non-null value in that column having the same type.
+     */
+    static class TupleComparator implements Comparator<Tuple> {
+        private final int fieldNum;
+        private byte datatype;
+        private boolean typeFound=false;
+
+        /**
+         * @param fieldNum position of the field the tuples are compared on
+         */
+        public TupleComparator(int fieldNum) {
+            this.fieldNum = fieldNum;
+        }
+
+        /**
+         * Compares two tuples on the configured field.
+         *
+         * @return negative, zero or positive as o1 sorts before, equal to or after o2
+         * @throws RuntimeException wrapping the ExecException if a field cannot be read
+         * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
+         */
+        @Override
+        public int compare(Tuple o1, Tuple o2) {
+            // Two nulls are equal; answering -1 for that pair would break the
+            // comparator contract.
+            if (o1 == null)
+                return (o2 == null) ? 0 : -1;
+            if (o2 == null)
+                return 1;
+            try {
+                Object field1 = o1.get(fieldNum);
+                Object field2 = o2.get(fieldNum);
+                // Null fields are ordered here instead of being passed on:
+                // DataType.compare below is told that both operands have the
+                // cached type, and a null first value would otherwise be what
+                // the cached column type is derived from.
+                if (field1 == null)
+                    return (field2 == null) ? 0 : -1;
+                if (field2 == null)
+                    return 1;
+                if (!typeFound) {
+                    datatype = DataType.findType(field1);
+                    typeFound = true;
+                }
+                return DataType.compare(field1, field2, datatype, datatype);
+            } catch (ExecException e) {
+                throw new RuntimeException("Error while comparing o1:" + o1
+                        + " and o2:" + o2, e);
+            }
+        }
+    }
+
+    /**
+     * Returns the top-n tuples of a bag in a single call.
+     *
+     * @param tuple (n, fieldNum, bag): how many tuples to keep, the position of
+     *              the field to rank by, and the bag to select from
+     * @return a bag with the retained tuples, added in the priority queue's
+     *         iteration order (not sorted), or null when the input is null or
+     *         has fewer than three fields
+     * @throws IOException declared by the overridden method; exceptions raised in
+     *         this body are rethrown as RuntimeException with the cause attached
+     */
+    @Override
+    public DataBag exec(Tuple tuple) throws IOException {
+        if (tuple == null || tuple.size() < 3) {
+            return null;
+        }
+        try {
+            int n = (Integer) tuple.get(0);
+            int fieldNum = (Integer) tuple.get(1);
+            DataBag inputBag = (DataBag) tuple.get(2);
+            PriorityQueue<Tuple> store = new PriorityQueue<Tuple>(n + 1,
+                    new TupleComparator(fieldNum));
+            updateTop(store, n, inputBag);
+            DataBag outputBag = mBagFactory.newDefaultBag();
+            for (Tuple t : store) {
+                outputBag.add(t);
+            }
+            if (log.isDebugEnabled()) {
+                // Only roughly one call in a thousand is dumped to the debug log.
+                if (randomizer.nextInt(1000) == 1) {
+                    log.debug("outputting a bag: ");
+                    for (Tuple t : outputBag) 
+                        log.debug("outputting "+t.toDelimitedString("\t"));
+                    log.debug("==================");
+                }
+            }
+            return outputBag;
+        } catch (ExecException e) {
+            throw new RuntimeException("ExecException executing function: ", e);
+        } catch (Exception e) {
+            // Chain the cause so the original stack trace is not lost.
+            throw new RuntimeException("General Exception executing function: " + e, e);
+        }
+    }
+
+    /**
+     * Feeds every tuple of a bag through the bounded queue.
+     *
+     * Each tuple is added and, whenever the queue then holds more than
+     * limit entries, the head of the queue is removed again.
+     *
+     * @param store    queue holding the tuples retained so far; modified in place
+     * @param limit    largest number of tuples left in the queue after each step
+     * @param inputBag bag whose tuples are offered to the queue
+     */
+    protected static void updateTop(PriorityQueue<Tuple> store, int limit, DataBag inputBag) {
+        for (Tuple candidate : inputBag) {
+            store.add(candidate);
+            if (store.size() <= limit) {
+                continue;
+            }
+            store.poll();
+        }
+    }
+
+    /**
+     * Declares the single argument signature offered by this function:
+     * (integer, integer, bag).
+     *
+     * @return a one-element list holding the FuncSpec for that signature
+     * @see org.apache.pig.EvalFunc#getArgToFuncMapping()
+     */
+    @Override
+    public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
+        final byte[] argTypes = { DataType.INTEGER, DataType.INTEGER, DataType.BAG };
+        List<FieldSchema> argFields = new ArrayList<FieldSchema>(3);
+        for (byte argType : argTypes) {
+            argFields.add(new Schema.FieldSchema(null, argType));
+        }
+        List<FuncSpec> mapping = new ArrayList<FuncSpec>(1);
+        mapping.add(new FuncSpec(this.getClass().getName(), new Schema(argFields)));
+        return mapping;
+    }
+
+    /**
+     * Describes the result as a bag carrying the schema of the third argument.
+     *
+     * @param input schema of the arguments passed to the function
+     * @return a one-field schema holding that bag, or null when fewer than three
+     *         arguments are described or building the schema throws
+     */
+    @Override
+    public Schema outputSchema(Schema input) {
+        try {
+            if (input.size() >= 3) {
+                Schema innerSchema = input.getField(2).schema;
+                return new Schema(new Schema.FieldSchema(null, innerSchema, DataType.BAG));
+            }
+        } catch (Exception e) {
+            // fall through: an unusable input schema yields no output schema
+        }
+        return null;
+    }
+
+    /**
+     * @return the class name of the nested Initial function
+     */
+    @Override
+    public String getInitial() {
+        final Class<?> stage = Initial.class;
+        return stage.getName();
+    }
+
+    /**
+     * @return the class name of the nested Intermed function
+     */
+    @Override
+    public String getIntermed() {
+        final Class<?> stage = Intermed.class;
+        return stage.getName();
+    }
+
+    /**
+     * @return the class name of the nested Final function
+     */
+    @Override
+    public String getFinal() {
+        final Class<?> stage = Final.class;
+        return stage.getName();
+    }
+
+    /**
+     * Function named by getInitial(). Copies the tuples of the input bag into
+     * a fresh bag and outputs a Tuple with the schema
+     * <Int, Int, DataBag> -- same schema as expected input.
+     */
+    static public class Initial extends EvalFunc<Tuple> {
+        /**
+         * @param tuple (n, fieldNum, bag)
+         * @return a new three-field tuple (n, fieldNum, copy of the bag), or null
+         *         when the input is null or has fewer than three fields
+         * @throws IOException declared by the overridden method; exceptions raised
+         *         in this body are rethrown as RuntimeException with the cause attached
+         */
+        @Override
+        public Tuple exec(Tuple tuple) throws IOException {
+            if (tuple == null || tuple.size() < 3) {
+                return null;
+            }
+            
+            try {
+                int n = (Integer) tuple.get(0);
+                int fieldNum = (Integer) tuple.get(1);
+                DataBag inputBag = (DataBag) tuple.get(2);
+                Tuple retTuple = mTupleFactory.newTuple(3);
+                DataBag outputBag = mBagFactory.newDefaultBag();
+                // initially, there should only be one, so not much point in doing the priority queue
+                for (Tuple t : inputBag) {
+                    outputBag.add(t);
+                }
+                retTuple.set(0, n);
+                retTuple.set(1,fieldNum);
+                retTuple.set(2, outputBag);               
+                return retTuple;
+            } catch (Exception e) {
+                // Chain the cause so the original stack trace is not lost.
+                throw new RuntimeException("General Exception executing function: " + e, e);
+            }
+        }
+    }
+
+    /**
+     * Function named by getIntermed(). Merges several partial results into one
+     * partial result of the same shape.
+     */
+    static public class Intermed extends EvalFunc<Tuple> {
+        private static final Log log = LogFactory.getLog(Intermed.class);
+        private final Random randomizer = new Random();
+
+        /**
+         * The input is a tuple that contains a single bag.
+         * This bag contains outputs of the Initial step --
+         * tuples of the format (limit, index, { top_tuples })
+         *
+         * We need to take the top of tops and return a similar tuple.
+         * The limit and index are read from the first tuple of the bag only;
+         * later tuples that are null or have fewer than three fields are skipped.
+         *
+         * @return (limit, index, { top_tuples }), or null when the input is null
+         *         or empty, the bag has no tuples, or its first tuple is null or
+         *         has fewer than three fields
+         * @throws IOException declared by the overridden method; exceptions raised
+         *         in this body are rethrown as RuntimeException with the cause attached
+         * @see org.apache.pig.EvalFunc#exec(org.apache.pig.data.Tuple)
+         */
+        @Override
+        public Tuple exec(Tuple input) throws IOException {
+            if (input == null || input.size() < 1) {
+                return null;
+            }
+            try {
+                DataBag bagOfIntermediates = (DataBag) input.get(0);
+                Iterator<Tuple> intermediateIterator = bagOfIntermediates.iterator();
+                if (!intermediateIterator.hasNext()) {
+                    return null;
+                }
+                Tuple peekTuple = intermediateIterator.next();
+                if (peekTuple == null || peekTuple.size() < 3 ) return null;
+                int n = (Integer) peekTuple.get(0);
+                int fieldNum = (Integer) peekTuple.get(1);
+                DataBag inputBag = (DataBag) peekTuple.get(2);
+
+                PriorityQueue<Tuple> store = new PriorityQueue<Tuple>(n + 1,
+                        new TupleComparator(fieldNum));
+
+                updateTop(store, n, inputBag);
+
+                while (intermediateIterator.hasNext()) {
+                    Tuple t = intermediateIterator.next();
+                    if (t == null || t.size() < 3 ) continue;
+                    updateTop(store, n, (DataBag) t.get(2));
+                }   
+
+                DataBag outputBag = mBagFactory.newDefaultBag();
+                for (Tuple t : store) {
+                    outputBag.add(t);
+                }
+                Tuple retTuple = mTupleFactory.newTuple(3);
+                retTuple.set(0, n);
+                retTuple.set(1,fieldNum);
+                retTuple.set(2, outputBag);
+                if (log.isDebugEnabled()) { 
+                    // Only roughly one call in a thousand is dumped to the debug log.
+                    if (randomizer.nextInt(1000) == 1) log.debug("outputting "+retTuple.toDelimitedString("\t")); 
+                }
+                return retTuple;
+            } catch (ExecException e) {
+                throw new RuntimeException("ExecException executing function: ", e);
+            } catch (Exception e) {
+                // Chain the cause so the original stack trace is not lost.
+                throw new RuntimeException("General Exception executing function: " + e, e);
+            }
+        }
+        
+    }
+    
+    /**
+     * Function named by getFinal(). Merges the partial results and returns the
+     * retained tuples as a plain bag.
+     */
+    static public class Final extends EvalFunc<DataBag> {
+
+        private static final Log log = LogFactory.getLog(Final.class);
+        private final Random randomizer = new Random();
+
+        /**
+         * The input to this function is a tuple that contains a single bag.
+         * This bag, in turn, contains outputs of the Intermediate step -- 
+         * tuples of the format (limit, index, { top_tuples } )
+         *
+         * we want to return a bag of top tuples.
+         * The limit and index are read from the first tuple of the bag only;
+         * later tuples that are null or have fewer than three fields are skipped.
+         *
+         * @return a bag with the retained tuples, or null when the input is null
+         *         or empty, the bag has no tuples, or its first tuple is null or
+         *         has fewer than three fields
+         * @throws IOException declared by the overridden method; exceptions raised
+         *         in this body are rethrown as RuntimeException with the cause attached
+         * @see org.apache.pig.EvalFunc#exec(org.apache.pig.data.Tuple)
+         */
+        @Override
+        public DataBag exec(Tuple tuple) throws IOException {
+            if (tuple == null || tuple.size() < 1) {
+                return null;
+            }
+            try {
+                DataBag bagOfIntermediates = (DataBag) tuple.get(0);
+                Iterator<Tuple> intermediateIterator = bagOfIntermediates.iterator();
+                if (!intermediateIterator.hasNext()) {
+                    return null;
+                }
+                Tuple peekTuple = intermediateIterator.next();
+                if (peekTuple == null || peekTuple.size() < 3 ) return null;
+                int n = (Integer) peekTuple.get(0);
+                int fieldNum = (Integer) peekTuple.get(1);
+                DataBag inputBag = (DataBag) peekTuple.get(2);
+
+                PriorityQueue<Tuple> store = new PriorityQueue<Tuple>(n + 1,
+                        new TupleComparator(fieldNum));
+
+                updateTop(store, n, inputBag);
+
+                while (intermediateIterator.hasNext()) {
+                    Tuple t = intermediateIterator.next();
+                    if (t == null || t.size() < 3 ) continue;
+                    updateTop(store, n, (DataBag) t.get(2));
+                }   
+
+                DataBag outputBag = mBagFactory.newDefaultBag();
+                for (Tuple t : store) {
+                    outputBag.add(t);
+                }
+                if (log.isDebugEnabled()) {
+                    // Only roughly one call in a thousand is dumped to the debug log.
+                    if (randomizer.nextInt(1000) == 1) for (Tuple t : outputBag) log.debug("outputting "+t.toDelimitedString("\t"));
+                }
+                return outputBag;
+            } catch (ExecException e) {
+                throw new RuntimeException("ExecException executing function: ", e);
+            } catch (Exception e) {
+                // Chain the cause so the original stack trace is not lost.
+                throw new RuntimeException("General Exception executing function: " + e, e);
+            }
+        }
+    }
+}
+

Added: hadoop/pig/trunk/src/org/apache/pig/builtin/TOTUPLE.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/builtin/TOTUPLE.java?rev=957100&view=auto
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/builtin/TOTUPLE.java (added)
+++ hadoop/pig/trunk/src/org/apache/pig/builtin/TOTUPLE.java Wed Jun 23 03:37:11 2010
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.builtin;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.DataType;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+
+/**
+ * Builds a tuple out of its arguments. For example
+ *   T = foreach U generate TOTUPLE($0, $1, $2);
+ * generates a tuple containing $0, $1, and $2.
+ */
+public class TOTUPLE extends EvalFunc<Tuple> {
+
+    /**
+     * Copies the fields of the input, in order, into a new tuple.
+     *
+     * @param input tuple whose fields become the fields of the result
+     * @return a new tuple holding the input's fields in order (null fields included)
+     * @throws IOException declared by the overridden method; any exception raised
+     *         in this body is rethrown wrapped in a RuntimeException
+     */
+    @Override
+    public Tuple exec(Tuple input) throws IOException {
+        try {
+            final List<Object> fields = new ArrayList<Object>();
+            int position = 0;
+            while (position < input.size()) {
+                fields.add(input.get(position));
+                position++;
+            }
+            return TupleFactory.getInstance().newTuple(fields);
+        } catch (Exception e) {
+            throw new RuntimeException("Error while creating a tuple", e);
+        }
+    }
+
+    /**
+     * Describes the result as a tuple whose fields are the input's fields.
+     *
+     * @param input schema of the arguments passed to the function
+     * @return a one-field schema of type tuple wrapping a copy of the input's
+     *         field list, or null if building it throws
+     */
+    @Override
+    public Schema outputSchema(Schema input) {
+        try {
+            final Schema inner = new Schema();
+            int position = 0;
+            while (position < input.size()) {
+                inner.add(input.getField(position));
+                position++;
+            }
+            final String alias = getSchemaName(this.getClass().getName().toLowerCase(), input);
+            final Schema.FieldSchema tupleField = new Schema.FieldSchema(alias, inner, DataType.TUPLE);
+            return new Schema(tupleField);
+        } catch (Exception e) {
+            return null;
+        }
+    }
+
+}

Added: hadoop/pig/trunk/src/org/apache/pig/builtin/TRIM.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/builtin/TRIM.java?rev=957100&view=auto
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/builtin/TRIM.java (added)
+++ hadoop/pig/trunk/src/org/apache/pig/builtin/TRIM.java Wed Jun 23 03:37:11 2010
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.builtin;
+
+import java.io.IOException;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.backend.executionengine.ExecException;
+import org.apache.pig.data.DataType;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+
+/**
+ * Strips leading and trailing whitespace from a string by delegating to
+ * {@link java.lang.String#trim() String.trim()}.
+ */
+public class TRIM extends EvalFunc<String> {
+    /**
+     * @param input tuple whose first field is the string to trim
+     * @return the trimmed string; null when the input is null or has no fields,
+     *         when the first field is null, or when reading the field raises an
+     *         ExecException (which is logged as a warning)
+     */
+    @Override
+    public String exec(Tuple input) throws IOException {
+        final boolean nothingToRead = (input == null) || (input.size() == 0);
+        if (nothingToRead) {
+            return null;
+        }
+        final String value;
+        try {
+            value = (String) input.get(0);
+        } catch (ExecException e) {
+            log.warn("Error reading input: " + e.getMessage());
+            return null;
+        }
+        if (value == null) {
+            return null;
+        }
+        // An empty string is handed back as is, without calling trim().
+        return (value.length() == 0) ? value : value.trim();
+    }
+
+    /**
+     * @param input schema of the arguments; not consulted
+     * @return a schema with a single unnamed chararray field
+     */
+    @Override
+    public Schema outputSchema(Schema input) {
+        final Schema.FieldSchema result = new Schema.FieldSchema(null, DataType.CHARARRAY);
+        return new Schema(result);
+    }
+}

Added: hadoop/pig/trunk/src/org/apache/pig/builtin/UCFIRST.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/builtin/UCFIRST.java?rev=957100&view=auto
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/builtin/UCFIRST.java (added)
+++ hadoop/pig/trunk/src/org/apache/pig/builtin/UCFIRST.java Wed Jun 23 03:37:11 2010
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.builtin;
+
+import java.io.IOException;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.backend.executionengine.ExecException;
+import org.apache.pig.data.DataType;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+
+/**
+ * Upper-cases the first character of a string and leaves the rest unchanged.
+ */
+public class UCFIRST extends EvalFunc<String> {
+    /**
+     * @param input tuple whose first field is the string to convert
+     * @return the string with its first character upper-cased; the empty string
+     *         unchanged; null when the input is null or has no fields, when the
+     *         first field is null, or when reading the field raises an
+     *         ExecException (which is logged as a warning)
+     */
+    @Override
+    public String exec(Tuple input) throws IOException {
+        final boolean nothingToRead = (input == null) || (input.size() == 0);
+        if (nothingToRead) {
+            return null;
+        }
+        final String value;
+        try {
+            value = (String) input.get(0);
+        } catch (ExecException e) {
+            log.warn("Error reading input: " + e.getMessage());
+            return null;
+        }
+        if (value == null) {
+            return null;
+        }
+        if (value.length() == 0) {
+            return value;
+        }
+        final char head = Character.toUpperCase(value.charAt(0));
+        return head + value.substring(1);
+    }
+
+    /**
+     * @param input schema of the arguments; not consulted
+     * @return a schema with a single unnamed chararray field
+     */
+    @Override
+    public Schema outputSchema(Schema input) {
+        final Schema.FieldSchema result = new Schema.FieldSchema(null, DataType.CHARARRAY);
+        return new Schema(result);
+    }
+}

Added: hadoop/pig/trunk/src/org/apache/pig/builtin/UPPER.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/builtin/UPPER.java?rev=957100&view=auto
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/builtin/UPPER.java (added)
+++ hadoop/pig/trunk/src/org/apache/pig/builtin/UPPER.java Wed Jun 23 03:37:11 2010
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pig.builtin;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.ArrayList;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.PigWarning;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.DataType;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+import org.apache.pig.FuncSpec;
+
+
+/**
+ * UPPER is an eval function that converts a string to upper case.
+ * Example:
+ *      A = load 'mydata' as (name);
+ *      B = foreach A generate UPPER(name);
+ */
+public class UPPER extends EvalFunc<String> {
+
+    /**
+     * Upper-cases an input string.
+     *
+     * @param input tuple; its first field is the value to convert
+     * @return the upper-cased string; null when the input is null or has no
+     *         fields, when the first field is null, or when the conversion
+     *         fails (a warning is recorded in that last case)
+     * @exception IOException if reading the first field fails outside the
+     *            guarded conversion
+     */
+    @Override
+    public String exec(Tuple input) throws IOException {
+        final boolean hasValue = input != null && input.size() != 0 && input.get(0) != null;
+        if (!hasValue) {
+            return null;
+        }
+
+        try {
+            final String text = (String) input.get(0);
+            return text.toUpperCase();
+        } catch (ClassCastException e) {
+            final String message = "unable to cast input " + input.get(0) + " of class "
+                    + input.get(0).getClass() + " to String";
+            warn(message, PigWarning.UDF_WARNING_1);
+            return null;
+        } catch (Exception e) {
+            final String message = "Error processing input " + input.get(0);
+            warn(message, PigWarning.UDF_WARNING_1);
+            return null;
+        }
+    }
+
+    /**
+     * Names the output column via getSchemaName, using this class's
+     * lower-cased name, and types it as chararray.
+     *
+     * @param input - schema of the input data
+     * @return a schema with that single chararray field
+     */
+    @Override
+    public Schema outputSchema(Schema input) {
+        final String alias = getSchemaName(this.getClass().getName().toLowerCase(), input);
+        final Schema.FieldSchema column = new Schema.FieldSchema(alias, DataType.CHARARRAY);
+        return new Schema(column);
+    }
+
+    /**
+     * Declares the single argument signature offered by this function: one chararray.
+     *
+     * @return a one-element list holding the FuncSpec for that signature
+     * @see org.apache.pig.EvalFunc#getArgToFuncMapping()
+     */
+    @Override
+    public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
+        final Schema argSchema = new Schema(new Schema.FieldSchema(null, DataType.CHARARRAY));
+        final List<FuncSpec> mapping = new ArrayList<FuncSpec>();
+        mapping.add(new FuncSpec(this.getClass().getName(), argSchema));
+        return mapping;
+    }
+
+}

Modified: hadoop/pig/trunk/test/org/apache/pig/test/TestBuiltin.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/test/org/apache/pig/test/TestBuiltin.java?rev=957100&r1=957099&r2=957100&view=diff
==============================================================================
--- hadoop/pig/trunk/test/org/apache/pig/test/TestBuiltin.java (original)
+++ hadoop/pig/trunk/test/org/apache/pig/test/TestBuiltin.java Wed Jun 23 03:37:11 2010
@@ -17,12 +17,19 @@
  */
 package org.apache.pig.test;
 
-import static org.junit.Assert.*;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
+import java.lang.reflect.Method;
 import java.util.Arrays;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
 import java.util.Map;
+import java.util.Random;
+import java.util.Set;
 
 import org.apache.pig.Algebraic;
 import org.apache.pig.EvalFunc;
@@ -34,18 +41,35 @@ import org.apache.pig.backend.hadoop.dat
 import org.apache.pig.builtin.ARITY;
 import org.apache.pig.builtin.BagSize;
 import org.apache.pig.builtin.CONCAT;
+import org.apache.pig.builtin.COR;
 import org.apache.pig.builtin.COUNT;
 import org.apache.pig.builtin.COUNT_STAR;
+import org.apache.pig.builtin.COV;
 import org.apache.pig.builtin.DIFF;
 import org.apache.pig.builtin.Distinct;
+import org.apache.pig.builtin.INDEXOF;
+import org.apache.pig.builtin.LAST_INDEX_OF;
+import org.apache.pig.builtin.LCFIRST;
+import org.apache.pig.builtin.LOWER;
 import org.apache.pig.builtin.MapSize;
 import org.apache.pig.builtin.PigStorage;
+import org.apache.pig.builtin.REGEX_EXTRACT;
+import org.apache.pig.builtin.REGEX_EXTRACT_ALL;
+import org.apache.pig.builtin.REPLACE;
 import org.apache.pig.builtin.SIZE;
+import org.apache.pig.builtin.SPLIT;
+import org.apache.pig.builtin.SUBSTRING;
 import org.apache.pig.builtin.StringConcat;
 import org.apache.pig.builtin.StringSize;
+import org.apache.pig.builtin.TOBAG;
 import org.apache.pig.builtin.TOKENIZE;
+import org.apache.pig.builtin.TOP;
+import org.apache.pig.builtin.TOTUPLE;
+import org.apache.pig.builtin.TRIM;
 import org.apache.pig.builtin.TextLoader;
 import org.apache.pig.builtin.TupleSize;
+import org.apache.pig.builtin.UCFIRST;
+import org.apache.pig.builtin.UPPER;
 import org.apache.pig.data.BagFactory;
 import org.apache.pig.data.DataBag;
 import org.apache.pig.data.DataByteArray;
@@ -1106,6 +1130,413 @@ public class TestBuiltin {
 
     }
     
+    /** Compares every math UDF with the java.lang.Math method of the same lower-cased name. */
+    @Test
+    public void testMathFuncs() throws Exception {
+        // We assume that UDFs are stored in org.apache.pig.builtin.
+        // Change this test case if we add more hierarchy later.
+        // Also, we assume that java.lang.Math declares a one-argument (double)
+        // method whose name is the lower-cased name of each UDF.
+        String udfPackage = "org.apache.pig.builtin.";
+        String[] mathFuncs = {
+                "SIN",
+                "SINH",
+                "ASIN",
+                "COS",
+                "COSH",
+                "ACOS",
+                "TAN",
+                "TANH",
+                "ATAN",
+                "LOG",
+                "LOG10",
+                "SQRT",
+                "CEIL",
+                "EXP",
+                "FLOOR",
+                "CBRT"
+        };
+
+        // Inputs are random doubles in [0.0, 1.0); the failure message records
+        // the input so that a failing case can be replayed by hand.
+        Random generator = new Random();
+        double delta = 0.1;
+
+        for (String func : mathFuncs) {
+            @SuppressWarnings("unchecked")
+            EvalFunc<Double> evalFunc = (EvalFunc<Double>) Class.forName(udfPackage + func).newInstance();
+            // Locale-independent lower-casing: "SIN" must map to "sin" even under a Turkish default locale.
+            Method mathMethod = Math.class.getDeclaredMethod(func.toLowerCase(java.util.Locale.ENGLISH), double.class);
+            Double input = generator.nextDouble();
+            Tuple tup = DefaultTupleFactory.getInstance().newTuple(1);
+            tup.set(0, input);
+            Double actual = evalFunc.exec(tup);
+            Double expected = (Double) mathMethod.invoke(null, input);
+            String msg = "[Testing " + func + " on input: " + input + " ( (actual) " + actual + " == " + expected + " (expected) )]";
+            // JUnit's parameter order is (message, expected, actual, delta).
+            assertEquals(msg, expected, actual, delta);
+        }
+    }
+    
+    /** Exercises the string UDFs one by one, each on a small hand-checked input. */
+    @Test
+    public void testStringFuncs() throws Exception {
+        // Since String functions are trivial we add test on per case basis
+        String inputStr = "Hello World!";
+        String inputStrLower = "hello world!";
+        String inputStrUpper = "HELLO WORLD!";
+        String inputStrCamel = "hello World!";
+        String inputStroWitha = "Hella Warld!";
+        String inputStrExtra = "Hello World!   ";
+        
+        // Arguments shared by LAST_INDEX_OF and INDEXOF below: (string, string to search for)
+        List<Object> l = new LinkedList<Object>();
+        l.add(inputStr);
+        l.add("o");
+        
+        String expected, output;
+        Tuple input;
+        EvalFunc<String> strFunc;
+        EvalFunc<Integer> intFunc;
+        
+        strFunc = new LCFIRST();
+        input = DefaultTupleFactory.getInstance().newTuple(inputStr);
+        expected = inputStrCamel;
+        output = strFunc.exec(input);
+        assertEquals(expected, output);
+        
+        strFunc = new UCFIRST();
+        input = DefaultTupleFactory.getInstance().newTuple(inputStrCamel);
+        expected = inputStr;
+        output = strFunc.exec(input);
+        assertEquals(expected, output);
+         
+        // "o" occurs in "Hello World!" at indexes 4 and 7; the last one is expected here
+        intFunc = new LAST_INDEX_OF();
+        input = DefaultTupleFactory.getInstance().newTuple(l);
+        assertEquals(7, intFunc.exec(input).intValue());
+        
+        // ... and the first one here
+        intFunc = new INDEXOF();
+        input = DefaultTupleFactory.getInstance().newTuple(l);
+        assertEquals(4, intFunc.exec(input).intValue());
+        
+        strFunc = new UPPER();
+        input = DefaultTupleFactory.getInstance().newTuple(inputStr);
+        expected = inputStrUpper;
+        output = strFunc.exec(input);
+        assertEquals(expected, output);
+        
+        strFunc = new LOWER();
+        input = DefaultTupleFactory.getInstance().newTuple(inputStr);
+        expected = inputStrLower;
+        output = strFunc.exec(input);
+        assertEquals(expected, output);
+        
+        strFunc = new REPLACE();
+        l.clear();
+        l.add(inputStr);
+        l.add("o");
+        l.add("a");
+        input = DefaultTupleFactory.getInstance().newTuple(l);
+        expected = inputStroWitha;
+        output = strFunc.exec(input);
+        assertEquals(expected, output);
+        
+        strFunc = new SUBSTRING();
+        l.clear();
+        l.add(inputStr);
+        l.add(1);
+        l.add(5);
+        input = DefaultTupleFactory.getInstance().newTuple(l);
+        expected = "ello";
+        output = strFunc.exec(input);
+        assertEquals(expected, output);
+        
+        strFunc = new TRIM();
+        input = DefaultTupleFactory.getInstance().newTuple(inputStrExtra);
+        expected = inputStr;
+        output = strFunc.exec(input);
+        assertEquals(expected, output);
+        
+        SPLIT splitter = new SPLIT();
+        Tuple test1 = TupleFactory.getInstance().newTuple(1);
+        Tuple test2 = TupleFactory.getInstance().newTuple(2);
+        Tuple test3 = TupleFactory.getInstance().newTuple(3);
+        
+        // test a delimiter that does not occur in the input
+        test2.set(0, "foo");
+        test2.set(1, ":");
+        Tuple splits = splitter.exec(test2);
+        assertEquals("no matches should return tuple with original string", 1, splits.size());
+        assertEquals("no matches should return tuple with original string", "foo", splits.get(0));
+            
+        // test default delimiter
+        test1.set(0, "f ooo bar");
+        splits = splitter.exec(test1);
+        assertEquals("split on default value ", 3, splits.size());
+        assertEquals("f", splits.get(0));
+        assertEquals("ooo", splits.get(1));
+        assertEquals("bar", splits.get(2));
+        
+        // test trimming of whitespace
+        test1.set(0, "foo bar  ");
+        splits = splitter.exec(test1);
+        assertEquals("whitespace trimmed if no length arg", 2, splits.size());
+        
+        // test forcing null matches with length param
+        test3.set(0, "foo bar   ");
+        test3.set(1, "\\s");
+        test3.set(2, 10);
+        splits = splitter.exec(test3);
+        assertEquals("length forces empty string matches on end", 5, splits.size());
+        
+        // test limiting results with limit
+        test3.set(0, "foo:bar:baz");
+        test3.set(1, ":");
+        test3.set(2, 2);
+        splits = splitter.exec(test3);
+        assertEquals(2, splits.size());
+        assertEquals("foo", splits.get(0));
+        assertEquals("bar:baz", splits.get(1));
+        
+        Tuple t1 = TupleFactory.getInstance().newTuple(3);
+        t1.set(0, "/search/iy/term1/test");
+        t1.set(1, "^\\/search\\/iy\\/(.*?)\\/.*");
+        t1.set(2, 1); // index of the only capture group in the pattern
+        
+        Tuple t2 = TupleFactory.getInstance().newTuple(3);
+        t2.set(0, "/search/iy/term1/test");
+        t2.set(1, "^\\/search\\/iy\\/(.*?)\\/.*");
+        t2.set(2, 2); // the pattern declares no second capture group
+        
+        Tuple t3 = TupleFactory.getInstance().newTuple(3);
+        t3.set(0, null); // null input string
+        t3.set(1, "^\\/search\\/iy\\/(.*?)\\/.*");
+        t3.set(2, 2);
+        
+        REGEX_EXTRACT func = new REGEX_EXTRACT();
+        String r = func.exec(t1);
+        assertEquals("term1", r);
+        r = func.exec(t2);
+        assertTrue("a group index beyond the pattern's groups should yield null", r == null);
+        r = func.exec(t3);
+        assertTrue("a null input string should yield null", r == null);
+        
+        String matchRegex = "^(.+)\\b\\s+is a\\s+\\b(.+)$";
+        TupleFactory tupleFactory = TupleFactory.getInstance();
+        Tuple te1 = tupleFactory.newTuple(2);
+        te1.set(0,"this is a match");
+        te1.set(1, matchRegex);
+        
+        Tuple te2 = tupleFactory.newTuple(2);
+        te2.set(0, "no match");
+        te2.set(1, matchRegex);
+        
+        Tuple te3 = tupleFactory.newTuple(2);
+        te3.set(0, null);
+        te3.set(1, matchRegex);
+     
+        REGEX_EXTRACT_ALL funce = new REGEX_EXTRACT_ALL();
+        Tuple re = funce.exec(te1);
+        assertEquals(2, re.size()); // one field per capture group of matchRegex
+        assertEquals("this", re.get(0));
+        assertEquals("match", re.get(1));
+        
+        re = funce.exec(te2);
+        assertTrue("a non-matching input should yield null", re == null);
+        
+        re = funce.exec(te3);
+        assertTrue("a null input string should yield null", re == null);
+    }
+    
+    /** Runs COV and then COR on the same two six-element series and checks names and value. */
+    @Test
+    public void testStatsFunc() throws Exception {
+        COV cov = new COV("a","b");
+        DataBag dBag = DefaultBagFactory.getInstance().newDefaultBag(); // first series: 1, 4, 8, 4, 7, 8
+        Tuple tup1 = DefaultTupleFactory.getInstance().newTuple(1);
+        tup1.set(0, 1.0);
+        dBag.add(tup1);
+        tup1 = DefaultTupleFactory.getInstance().newTuple(1);
+        tup1.set(0, 4.0);
+        dBag.add(tup1);
+        tup1 = DefaultTupleFactory.getInstance().newTuple(1);
+        tup1.set(0, 8.0);
+        dBag.add(tup1);
+        tup1 = DefaultTupleFactory.getInstance().newTuple(1);
+        tup1.set(0, 4.0);
+        dBag.add(tup1);
+        tup1 = DefaultTupleFactory.getInstance().newTuple(1);
+        tup1.set(0, 7.0);
+        dBag.add(tup1);
+        tup1 = DefaultTupleFactory.getInstance().newTuple(1);
+        tup1.set(0, 8.0);
+        dBag.add(tup1);
+        DataBag dBag1 = DefaultBagFactory.getInstance().newDefaultBag(); // second series: 2, 2, 3, 3, 2, 4
+        tup1 = DefaultTupleFactory.getInstance().newTuple(1);
+        tup1.set(0, 2.0);
+        dBag1.add(tup1);
+        tup1 = DefaultTupleFactory.getInstance().newTuple(1);
+        tup1.set(0, 2.0);
+        dBag1.add(tup1);
+        tup1 = DefaultTupleFactory.getInstance().newTuple(1);
+        tup1.set(0, 3.0);
+        dBag1.add(tup1);
+        tup1 = DefaultTupleFactory.getInstance().newTuple(1);
+        tup1.set(0, 3.0);
+        dBag1.add(tup1);
+        tup1 = DefaultTupleFactory.getInstance().newTuple(1);
+        tup1.set(0, 2.0);
+        dBag1.add(tup1);
+        tup1 = DefaultTupleFactory.getInstance().newTuple(1);
+        tup1.set(0, 4.0);
+        dBag1.add(tup1);
+        Tuple input = DefaultTupleFactory.getInstance().newTuple(2);
+        input.set(0, dBag);
+        input.set(1, dBag1);
+        DataBag output = cov.exec(input);
+        Tuple ans = output.iterator().next(); // only the first result tuple is inspected
+        assertEquals("a", ans.get(0));
+        assertEquals("b", ans.get(1));
+        assertEquals(1.11111, (Double)ans.get(2),0.0005); // 92/6 - (32/6)*(16/6)
+        
+        // Same two series again, this time through COR.
+        COR cor = new COR("a","b");
+        dBag = DefaultBagFactory.getInstance().newDefaultBag();
+        tup1 = DefaultTupleFactory.getInstance().newTuple(1);
+        tup1.set(0, 1.0);
+        dBag.add(tup1);
+        tup1 = DefaultTupleFactory.getInstance().newTuple(1);
+        tup1.set(0, 4.0);
+        dBag.add(tup1);
+        tup1 = DefaultTupleFactory.getInstance().newTuple(1);
+        tup1.set(0, 8.0);
+        dBag.add(tup1);
+        tup1 = DefaultTupleFactory.getInstance().newTuple(1);
+        tup1.set(0, 4.0);
+        dBag.add(tup1);
+        tup1 = DefaultTupleFactory.getInstance().newTuple(1);
+        tup1.set(0, 7.0);
+        dBag.add(tup1);
+        tup1 = DefaultTupleFactory.getInstance().newTuple(1);
+        tup1.set(0, 8.0);
+        dBag.add(tup1);
+        dBag1 = DefaultBagFactory.getInstance().newDefaultBag();
+        tup1 = DefaultTupleFactory.getInstance().newTuple(1);
+        tup1.set(0, 2.0);
+        dBag1.add(tup1);
+        tup1 = DefaultTupleFactory.getInstance().newTuple(1);
+        tup1.set(0, 2.0);
+        dBag1.add(tup1);
+        tup1 = DefaultTupleFactory.getInstance().newTuple(1);
+        tup1.set(0, 3.0);
+        dBag1.add(tup1);
+        tup1 = DefaultTupleFactory.getInstance().newTuple(1);
+        tup1.set(0, 3.0);
+        dBag1.add(tup1);
+        tup1 = DefaultTupleFactory.getInstance().newTuple(1);
+        tup1.set(0, 2.0);
+        dBag1.add(tup1);
+        tup1 = DefaultTupleFactory.getInstance().newTuple(1);
+        tup1.set(0, 4.0);
+        dBag1.add(tup1);
+        input = DefaultTupleFactory.getInstance().newTuple(2);
+        input.set(0, dBag);
+        input.set(1, dBag1);
+        output = cor.exec(input);
+        ans = output.iterator().next();
+        assertEquals("a", ans.get(0));
+        assertEquals("b", ans.get(1));
+        assertEquals(0.582222509739582, (Double)ans.get(2) ,0.0005);
+    }
+    
+    /** Fails unless the Long in column {@code field} of every tuple is strictly greater than {@code limit}. */
+    private void checkItemsGT(Iterable<Tuple> tuples, int field, int limit) throws ExecException {
+        for (Tuple t : tuples) {
+            Long val = (Long) t.get(field);
+            assertTrue("Value " + val + " did not exceed the expected limit " + limit, val > limit);
+        }
+    }
+    /** Covers TOBAG, TOTUPLE and TOP (direct call, then the Initial/Intermed/Final stages). */
+    @Test
+    public void testMiscFunc() throws Exception {
+        TOBAG tb = new TOBAG();
+        Tuple input = TupleFactory.getInstance().newTuple();
+        for (int i = 0; i < 100; ++i) {
+            input.append(i);
+        }
+
+        Set<Integer> s = new HashSet<Integer>();
+        DataBag db = tb.exec(input);
+        for (Tuple t : db) {
+            s.add((Integer) t.get(0));
+        }
+
+        // finally check the bag had everything we put in the tuple.
+        assertEquals(100, s.size());
+        for (int i = 0; i < 100; ++i) {
+            assertTrue(s.contains(i));
+        }
+        
+        TOTUPLE tt = new TOTUPLE();
+
+        input = TupleFactory.getInstance().newTuple();
+        for (int i = 0; i < 100; ++i) {
+            input.append(i);
+        }
+
+        Tuple output = tt.exec(input);
+        assertTrue("TOTUPLE returned its input tuple instead of a distinct one", input != output);
+        assertEquals(input, output);
+        
+        TOP top = new TOP();
+        TupleFactory tupleFactory = DefaultTupleFactory.getInstance();
+        BagFactory bagFactory = DefaultBagFactory.getInstance();
+        Tuple inputTuple = tupleFactory.newTuple(3);
+        DataBag dBag = bagFactory.newDefaultBag();
+        
+        // set N = 10 i.e retain top 10 tuples
+        inputTuple.set(0, 10);
+        // compare tuples by field number 1
+        inputTuple.set(1, 1);
+        // set the data bag containing the tuples
+        inputTuple.set(2, dBag);
+
+        // generate tuples of the form (group-0, 0L), (group-1, 1L) ... (group-99, 99L)
+        for (long i = 0; i < 100; i++) {
+            Tuple nestedTuple = tupleFactory.newTuple(2);
+            nestedTuple.set(0, "group-" + i);
+            nestedTuple.set(1, i);
+            dBag.add(nestedTuple);
+        }
+        
+        DataBag outBag = top.exec(inputTuple);
+        assertEquals(10L, outBag.size());
+        checkItemsGT(outBag, 1, 89);
+        
+        // two initial results
+        Tuple init1 = (new TOP.Initial()).exec(inputTuple);
+        Tuple init2 = (new TOP.Initial()).exec(inputTuple);
+
+        // two intermediate results, both computed from the same pair of initial results
+        DataBag intermedBag = bagFactory.newDefaultBag();
+        intermedBag.add(init1);
+        intermedBag.add(init2);
+        Tuple intermedInput = tupleFactory.newTuple(intermedBag);
+        Tuple intermedOutput1 = (new TOP.Intermed()).exec(intermedInput);
+        Tuple intermedOutput2 = (new TOP.Intermed()).exec(intermedInput);
+        checkItemsGT((DataBag)intermedOutput1.get(2), 1, 94);
+
+        // final result
+        DataBag finalInputBag = bagFactory.newDefaultBag();
+        finalInputBag.add(intermedOutput1);
+        finalInputBag.add(intermedOutput2);
+        Tuple finalInput = tupleFactory.newTuple(finalInputBag);
+        outBag = (new TOP.Final()).exec(finalInput);
+        assertEquals(10L, outBag.size());
+        checkItemsGT(outBag, 1, 96);
+    }
     
     @Test
     public void testDistinct() throws Exception {
@@ -1368,7 +1799,7 @@ public class TestBuiltin {
         assertTrue(f3.size() == arity3);
         Util.deleteFile(cluster, "input.txt");
     }
-
+    
     /*
     @Test
     public void testLFBin() throws Exception {