You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by dv...@apache.org on 2010/03/09 07:28:35 UTC

svn commit: r920710 - in /hadoop/pig/trunk: ./ contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/ contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/string/

Author: dvryaboy
Date: Tue Mar  9 06:28:35 2010
New Revision: 920710

URL: http://svn.apache.org/viewvc?rev=920710&view=rev
Log:
PIG-1248: [piggybank] some useful String functions

Modified:
    hadoop/pig/trunk/CHANGES.txt
    hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/INDEXOF.java
    hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/LASTINDEXOF.java
    hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/LOWER.java
    hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/REPLACE.java
    hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/SUBSTRING.java
    hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/UPPER.java
    hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/string/TestRegex.java

Modified: hadoop/pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/CHANGES.txt?rev=920710&r1=920709&r2=920710&view=diff
==============================================================================
--- hadoop/pig/trunk/CHANGES.txt (original)
+++ hadoop/pig/trunk/CHANGES.txt Tue Mar  9 06:28:35 2010
@@ -66,6 +66,8 @@ manner (rding via pradeepkth)
 
 IMPROVEMENTS
 
+PIG-1248: [piggybank] some useful String functions (dvryaboy)
+
 PIG-1251: Move SortInfo calculation earlier in compilation (ashutoshc)
 
 PIG-1233: NullPointerException in AVG  (ankur via olgan)

Modified: hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/INDEXOF.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/INDEXOF.java?rev=920710&r1=920709&r2=920710&view=diff
==============================================================================
--- hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/INDEXOF.java (original)
+++ hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/INDEXOF.java Tue Mar  9 06:28:35 2010
@@ -19,47 +19,50 @@
 package org.apache.pig.piggybank.evaluation.string;
 
 import java.io.IOException;
-import java.util.List;
-import java.util.ArrayList;
 
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.pig.EvalFunc;
 import org.apache.pig.data.Tuple;
 import org.apache.pig.data.DataType;
 import org.apache.pig.impl.logicalLayer.schema.Schema;
-import org.apache.pig.impl.logicalLayer.FrontendException;
-import org.apache.pig.FuncSpec;
 
 
 /**
- * string.INSTR implements eval function to search for a string
+ * string.INDEXOF implements eval function to search for a string
  * Example:
  *      register pigudfs.jar;
  *      A = load 'mydata' as (name);
  *      B = foreach A generate string.INDEXOF(name, ",");
  *      dump B;
  */
-public class INDEXOF extends EvalFunc<Integer>
-{
+public class INDEXOF extends EvalFunc<Integer> {
+    
+    private static final Log log = LogFactory.getLog(INDEXOF.class);
+
     /**
      * Method invoked on every tuple during foreach evaluation
      * @param input tuple; first column is assumed to have the column to convert
      *                     the second column is the string we search for
      *                     the third is an optional column from where to start the search
      * @exception java.io.IOException
+     * @return index of first occurrence, or null in case of processing error
      */
     public Integer exec(Tuple input) throws IOException {
-        if (input == null || input.size() == 0)
+        if (input == null || input.size() < 2) {
+            log.warn("invalid input tuple: "+input);
             return null;
-
-        try{
+        }
+        try {
             String str = (String)input.get(0);
             String search = (String)input.get(1);
             int fromIndex = 0;
-            if (input.size() ==3)
-                fromIndex = (Integer)input.get(1);
+            if (input.size() >=3)
+                fromIndex = (Integer)input.get(2);
             return str.indexOf(search, fromIndex);
-        }catch(Exception e){
-            System.err.println("Failed to process input; error - " + e.getMessage());
+        } catch(Exception e){
+            log.warn("Failed to process input; error - " + e.getMessage());
             return null;
         }
     }

Modified: hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/LASTINDEXOF.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/LASTINDEXOF.java?rev=920710&r1=920709&r2=920710&view=diff
==============================================================================
--- hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/LASTINDEXOF.java (original)
+++ hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/LASTINDEXOF.java Tue Mar  9 06:28:35 2010
@@ -19,42 +19,47 @@
 package org.apache.pig.piggybank.evaluation.string;
 
 import java.io.IOException;
-import java.util.List;
-import java.util.ArrayList;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.pig.EvalFunc;
 import org.apache.pig.data.Tuple;
 import org.apache.pig.data.DataType;
 import org.apache.pig.impl.logicalLayer.schema.Schema;
-import org.apache.pig.impl.logicalLayer.FrontendException;
-import org.apache.pig.FuncSpec;
-
 
 /**
- * string.INSTR implements eval function to search for the last occurance of a string
+ * string.LASTINDEXOF implements eval function to search for the last occurrence of a string<br>
+ * Returns null on error<br>
  * Example:
+ * <code>
  *      register pigudfs.jar;
  *      A = load 'mydata' as (name);
  *      B = foreach A generate string.LASTINDEXOF(name, ",");
  *      dump B;
+ * </code>
  */
-public class LASTINDEXOF extends EvalFunc<Integer>
-{
+public class LASTINDEXOF extends EvalFunc<Integer> {
+    private static final Log log = LogFactory.getLog(LASTINDEXOF.class);
+
     /**
-     * Method invoked on every tuple during foreach evaluation
-     * @param input tuple; first column is assumed to have the column to convert
+     * Finds the last location of a substring in a given string.
+     * @param input tuple:<ol>
+     * <li>the string to process
+     * <li>the substring to find
+     * </ol>
      * @exception java.io.IOException
+     * @return last location of substring, or null in case of processing errors.
      */
     public Integer exec(Tuple input) throws IOException {
-        if (input == null || input.size() == 0)
+        if (input == null || input.size() < 2)
             return null;
 
-        try{
+        try {
             String str = (String)input.get(0);
             String search = (String)input.get(1);
             return str.lastIndexOf(search);
-        }catch(Exception e){
-            System.err.println("Failed to process input; error - " + e.getMessage());
+        } catch(Exception e) {
+            log.warn("Failed to process input; error - " + e.getMessage());
             return null;
         }
     }

Modified: hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/LOWER.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/LOWER.java?rev=920710&r1=920709&r2=920710&view=diff
==============================================================================
--- hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/LOWER.java (original)
+++ hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/LOWER.java Tue Mar  9 06:28:35 2010
@@ -38,8 +38,8 @@ import org.apache.pig.FuncSpec;
  *      B = foreach A generate string.LOWER(name);
  *      dump B;
  */
-public class LOWER extends EvalFunc<String>
-{
+public class LOWER extends EvalFunc<String> {
+
     /**
      * Method invoked on every tuple during foreach evaluation
      * @param input tuple; first column is assumed to have the column to convert
@@ -49,21 +49,21 @@ public class LOWER extends EvalFunc<Stri
         if (input == null || input.size() == 0)
             return null;
 
-        try{
+        try {
             String str = (String)input.get(0);
             return str.toLowerCase();
-        }catch(Exception e){
-            System.err.println("Failed to process input; error - " + e.getMessage());
+        } catch(Exception e){
+            log.warn("Failed to process input; error - " + e.getMessage());
             return null;
         }
     }
 
-    //@Override
     /**
      * This method gives a name to the column.
      * @param input - schema of the input data
      * @return schema of the input data
      */
+    @Override
     public Schema outputSchema(Schema input) {
         return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), DataType.CHARARRAY));
     }

Modified: hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/REPLACE.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/REPLACE.java?rev=920710&r1=920709&r2=920710&view=diff
==============================================================================
--- hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/REPLACE.java (original)
+++ hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/REPLACE.java Tue Mar  9 06:28:35 2010
@@ -19,24 +19,25 @@
 package org.apache.pig.piggybank.evaluation.string;
 
 import java.io.IOException;
-import java.util.List;
-import java.util.ArrayList;
 
 import org.apache.pig.EvalFunc;
 import org.apache.pig.data.Tuple;
 import org.apache.pig.data.DataType;
 import org.apache.pig.impl.logicalLayer.schema.Schema;
-import org.apache.pig.impl.logicalLayer.FrontendException;
-import org.apache.pig.FuncSpec;
 
 
 /**
- * string.REPLACE implements eval function to replace part ofa string.
- * Example:
+ * string.REPLACE implements eval function to replace part of a string.
+ * Example:<code>
  *      register pigudfs.jar;
  *      A = load 'mydata' as (name);
  *      B = foreach A generate string.REPLACE(name, 'blabla', 'bla');
  *      dump B;
+ *      </code>
+ * The first argument is a string on which to perform the operation. The second argument
+ * is treated as a regular expression. The third argument is the replacement string.
+ * This is a wrapper around Java's String.replaceAll(String, String).
+ * 
  */
 public class REPLACE extends EvalFunc<String>
 {
@@ -46,7 +47,7 @@ public class REPLACE extends EvalFunc<St
      * @exception java.io.IOException
      */
     public String exec(Tuple input) throws IOException {
-        if (input == null || input.size() == 0)
+        if (input == null || input.size() < 3)
             return null;
 
         try{
@@ -55,21 +56,11 @@ public class REPLACE extends EvalFunc<St
             String replacewith = (String)input.get(2);
             return source.replaceAll(target, replacewith);
         }catch(Exception e){
-            System.err.println("Failed to process input; error - " + e.getMessage());
+            log.warn("Failed to process input; error - " + e.getMessage());
             return null;
         }
     }
 
-    //@Override
-//    /**
-//     * This method gives a name to the column.
-//     * @param input - schema of the input data
-//     * @return schema of the input data
-//     */
-//    public Schema outputSchema(Schema input) {
-//        return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), DataType.CHARARRAY));
-//    }
-
     @Override
     public Schema outputSchema(Schema input) {
         return new Schema(new Schema.FieldSchema(null, DataType.CHARARRAY));

Modified: hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/SUBSTRING.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/SUBSTRING.java?rev=920710&r1=920709&r2=920710&view=diff
==============================================================================
--- hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/SUBSTRING.java (original)
+++ hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/SUBSTRING.java Tue Mar  9 06:28:35 2010
@@ -19,57 +19,54 @@
 package org.apache.pig.piggybank.evaluation.string;
 
 import java.io.IOException;
-import java.util.List;
-import java.util.ArrayList;
 
 import org.apache.pig.EvalFunc;
+import org.apache.pig.PigWarning;
 import org.apache.pig.data.Tuple;
 import org.apache.pig.data.DataType;
 import org.apache.pig.impl.logicalLayer.schema.Schema;
-import org.apache.pig.impl.logicalLayer.FrontendException;
-import org.apache.pig.FuncSpec;
+
 
 
 /**
  * string.SUBSTRING implements eval function to get a part of a string.
- * Example:
+ * Example:<code>
  *      register pigudfs.jar;
  *      A = load 'mydata' as (name);
  *      B = foreach A generate string.SUBSTRING(name, 10, 12);
  *      dump B;
+ *      </code>
+ * First argument is the string to take a substring of.<br>
+ * Second argument is the index of the first character of substring.<br>
+ * Third argument is the index just past the last character of substring (exclusive, as in Java's String.substring).<br>
+ * If the last argument is past the end of the string, the substring from beginIndex to the end of the string is returned.
  */
-public class SUBSTRING extends EvalFunc<String>
-{
+public class SUBSTRING extends EvalFunc<String> {
+
     /**
      * Method invoked on every tuple during foreach evaluation
      * @param input tuple; first column is assumed to have the column to convert
      * @exception java.io.IOException
      */
     public String exec(Tuple input) throws IOException {
-        if (input == null || input.size() == 0)
+        if (input == null || input.size() < 3) {
+            log.warn("invalid number of arguments to SUBSTRING");
             return null;
-
-        try{
+        }
+        try {
             String source = (String)input.get(0);
             Integer beginindex = (Integer)input.get(1);
             Integer endindex = (Integer)input.get(2);
-            return source.substring(beginindex, endindex);
-        }catch(Exception e){
-            System.err.println("Failed to process input; error - " + e.getMessage());
+            return source.substring(beginindex, Math.min(source.length(), endindex));
+        } catch (NullPointerException npe) {
+            log.warn(npe.toString());
+            return null;
+        } catch (ClassCastException e) {
+            log.warn(e.toString());
             return null;
         }
     }
 
-    //@Override
-//    /**
-//     * This method gives a name to the column.
-//     * @param input - schema of the input data
-//     * @return schema of the input data
-//     */
-//    public Schema outputSchema(Schema input) {
-//        return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), DataType.CHARARRAY));
-//    }
-
     @Override
     public Schema outputSchema(Schema input) {
         return new Schema(new Schema.FieldSchema(null, DataType.CHARARRAY));

Modified: hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/UPPER.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/UPPER.java?rev=920710&r1=920709&r2=920710&view=diff
==============================================================================
--- hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/UPPER.java (original)
+++ hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/UPPER.java Tue Mar  9 06:28:35 2010
@@ -23,6 +23,7 @@ import java.util.List;
 import java.util.ArrayList;
 
 import org.apache.pig.EvalFunc;
+import org.apache.pig.PigWarning;
 import org.apache.pig.data.Tuple;
 import org.apache.pig.data.DataType;
 import org.apache.pig.impl.logicalLayer.schema.Schema;
@@ -38,46 +39,48 @@ import org.apache.pig.FuncSpec;
  *      B = foreach A generate string.UPPER(name);
  *      dump B;
  */
-public class UPPER extends EvalFunc<String>
-{
+public class UPPER extends EvalFunc<String> {
+
     /** 
-     * Method invoked on every tuple during foreach evaluation
+     * Upper-cases an input string.
      * @param input tuple; first column is assumed to have the column to convert
-     * @param output - resulting value
+     * 
      * @exception IOException
      */
     public String exec(Tuple input) throws IOException {
         if (input == null || input.size() == 0)
             return null;
 
-        try{
-            String str = (String)input.get(0);
-            return str.toUpperCase();
-        }catch(Exception e){
-            System.err.println("Failed to process input; error - " + e.getMessage());
+        String str = null;
+        try {
+            str = (String)input.get(0);
+        } catch (ClassCastException e) {
+            warn("unable to cast input "+input.get(0)+" of class "+
+                    input.get(0).getClass()+" to String", PigWarning.UDF_WARNING_1);
             return null;
         }
+        return str.toUpperCase();
     }
 
-    //@Override
     /**
      * This method gives a name to the column. 
      * @param input - schema of the input data
      * @return schema of the input data
      */
+    @Override
     public Schema outputSchema(Schema input) {
         return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), DataType.CHARARRAY));
     }
 
-     /* (non-Javadoc)
-      * @see org.apache.pig.EvalFunc#getArgToFuncMapping()
-      */
-     @Override
-     public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
+    /* (non-Javadoc)
+     * @see org.apache.pig.EvalFunc#getArgToFuncMapping()
+     */
+    @Override
+    public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
         List<FuncSpec> funcList = new ArrayList<FuncSpec>();
         funcList.add(new FuncSpec(this.getClass().getName(), new Schema(new Schema.FieldSchema(null, DataType.CHARARRAY))));
 
         return funcList;
-     }
+    }
 
 }

Modified: hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/string/TestRegex.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/string/TestRegex.java?rev=920710&r1=920709&r2=920710&view=diff
==============================================================================
--- hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/string/TestRegex.java (original)
+++ hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/string/TestRegex.java Tue Mar  9 06:28:35 2010
@@ -17,11 +17,14 @@
  */
 package org.apache.pig.piggybank.test.evaluation.string;
 
+import java.io.IOException;
+
 import junit.framework.TestCase;
 
 import org.apache.pig.data.Tuple;
 import org.apache.pig.data.TupleFactory;
 import org.apache.pig.piggybank.evaluation.string.RegexExtract;
+import org.apache.pig.piggybank.evaluation.string.RegexExtractAll;
 import org.apache.pig.piggybank.evaluation.string.RegexMatch;
 import org.junit.Test;
 
@@ -74,4 +77,33 @@ public class TestRegex extends TestCase 
         r = func.exec(t3);
         assertTrue(r==null);
     }
+    
+    @Test
+    public void testRegexExtractAll() throws IOException {
+        String matchRegex = "^(.+)\\b\\s+is a\\s+\\b(.+)$";
+        TupleFactory tupleFactory = TupleFactory.getInstance();
+        Tuple t1 = tupleFactory.newTuple(2);
+        t1.set(0,"this is a match");
+        t1.set(1, matchRegex);
+        
+        Tuple t2 = tupleFactory.newTuple(2);
+        t2.set(0, "no match");
+        t2.set(1, matchRegex);
+        
+        Tuple t3 = tupleFactory.newTuple(2);
+        t3.set(0, null);
+        t3.set(1, matchRegex);
+     
+        RegexExtractAll func = new RegexExtractAll();
+        Tuple r = func.exec(t1);
+        assertEquals(r.size(), 2);
+        assertEquals("this", r.get(0));
+        assertEquals("match", r.get(1));
+        
+        r = func.exec(t2);
+        assertTrue(r==null);
+        
+        r = func.exec(t3);
+        assertTrue(r==null);
+    }
 }