You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by dv...@apache.org on 2010/03/09 07:28:35 UTC
svn commit: r920710 - in /hadoop/pig/trunk: ./
contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/
contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/string/
Author: dvryaboy
Date: Tue Mar 9 06:28:35 2010
New Revision: 920710
URL: http://svn.apache.org/viewvc?rev=920710&view=rev
Log:
PIG-1248: [piggybank] some useful String functions
Modified:
hadoop/pig/trunk/CHANGES.txt
hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/INDEXOF.java
hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/LASTINDEXOF.java
hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/LOWER.java
hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/REPLACE.java
hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/SUBSTRING.java
hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/UPPER.java
hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/string/TestRegex.java
Modified: hadoop/pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/CHANGES.txt?rev=920710&r1=920709&r2=920710&view=diff
==============================================================================
--- hadoop/pig/trunk/CHANGES.txt (original)
+++ hadoop/pig/trunk/CHANGES.txt Tue Mar 9 06:28:35 2010
@@ -66,6 +66,8 @@ manner (rding via pradeepkth)
IMPROVEMENTS
+PIG-1248: [piggybank] some useful String functions (dvryaboy)
+
PIG-1251: Move SortInfo calculation earlier in compilation (ashutoshc)
PIG-1233: NullPointerException in AVG (ankur via olgan)
Modified: hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/INDEXOF.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/INDEXOF.java?rev=920710&r1=920709&r2=920710&view=diff
==============================================================================
--- hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/INDEXOF.java (original)
+++ hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/INDEXOF.java Tue Mar 9 06:28:35 2010
@@ -19,47 +19,50 @@
package org.apache.pig.piggybank.evaluation.string;
import java.io.IOException;
-import java.util.List;
-import java.util.ArrayList;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.DataType;
import org.apache.pig.impl.logicalLayer.schema.Schema;
-import org.apache.pig.impl.logicalLayer.FrontendException;
-import org.apache.pig.FuncSpec;
/**
- * string.INSTR implements eval function to search for a string
+ * string.INDEXOF implements eval function to search for a string
* Example:
* register pigudfs.jar;
* A = load 'mydata' as (name);
* B = foreach A generate string.INDEXOF(name, ",");
* dump B;
*/
-public class INDEXOF extends EvalFunc<Integer>
-{
+public class INDEXOF extends EvalFunc<Integer> {
+
+ private static final Log log = LogFactory.getLog(INDEXOF.class);
+
/**
* Method invoked on every tuple during foreach evaluation
* @param input tuple; first column is assumed to have the column to convert
* the second column is the string we search for
* the third is an optional column from where to start the search
* @exception java.io.IOException
+ * @return index of first occurrence, or null in case of processing error
*/
public Integer exec(Tuple input) throws IOException {
- if (input == null || input.size() == 0)
+ if (input == null || input.size() < 2) {
+ log.warn("invalid input tuple: "+input);
return null;
-
- try{
+ }
+ try {
String str = (String)input.get(0);
String search = (String)input.get(1);
int fromIndex = 0;
- if (input.size() ==3)
- fromIndex = (Integer)input.get(1);
+ if (input.size() >=3)
+ fromIndex = (Integer)input.get(2);
return str.indexOf(search, fromIndex);
- }catch(Exception e){
- System.err.println("Failed to process input; error - " + e.getMessage());
+ } catch(Exception e){
+ log.warn("Failed to process input; error - " + e.getMessage());
return null;
}
}
Modified: hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/LASTINDEXOF.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/LASTINDEXOF.java?rev=920710&r1=920709&r2=920710&view=diff
==============================================================================
--- hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/LASTINDEXOF.java (original)
+++ hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/LASTINDEXOF.java Tue Mar 9 06:28:35 2010
@@ -19,42 +19,47 @@
package org.apache.pig.piggybank.evaluation.string;
import java.io.IOException;
-import java.util.List;
-import java.util.ArrayList;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.DataType;
import org.apache.pig.impl.logicalLayer.schema.Schema;
-import org.apache.pig.impl.logicalLayer.FrontendException;
-import org.apache.pig.FuncSpec;
-
/**
- * string.INSTR implements eval function to search for the last occurance of a string
+ * string.LASTINDEXOF implements eval function to search for the last occurrence of a string<br>
+ * Returns null on error<br>
* Example:
+ * <code>
* register pigudfs.jar;
* A = load 'mydata' as (name);
* B = foreach A generate string.LASTINDEXOF(name, ",");
* dump B;
+ * </code>
*/
-public class LASTINDEXOF extends EvalFunc<Integer>
-{
+public class LASTINDEXOF extends EvalFunc<Integer> {
+ private static final Log log = LogFactory.getLog(LASTINDEXOF.class);
+
/**
- * Method invoked on every tuple during foreach evaluation
- * @param input tuple; first column is assumed to have the column to convert
+ * Finds the last location of a substring in a given string.
+ * @param input tuple:<ol>
+ * <li>the string to process
+ * <li>the substring to find
+ * </ol>
* @exception java.io.IOException
+ * @return last location of substring, or null in case of processing errors.
*/
public Integer exec(Tuple input) throws IOException {
- if (input == null || input.size() == 0)
+ if (input == null || input.size() < 2)
return null;
- try{
+ try {
String str = (String)input.get(0);
String search = (String)input.get(1);
return str.lastIndexOf(search);
- }catch(Exception e){
- System.err.println("Failed to process input; error - " + e.getMessage());
+ } catch(Exception e) {
+ log.warn("Failed to process input; error - " + e.getMessage());
return null;
}
}
Modified: hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/LOWER.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/LOWER.java?rev=920710&r1=920709&r2=920710&view=diff
==============================================================================
--- hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/LOWER.java (original)
+++ hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/LOWER.java Tue Mar 9 06:28:35 2010
@@ -38,8 +38,8 @@ import org.apache.pig.FuncSpec;
* B = foreach A generate string.LOWER(name);
* dump B;
*/
-public class LOWER extends EvalFunc<String>
-{
+public class LOWER extends EvalFunc<String> {
+
/**
* Method invoked on every tuple during foreach evaluation
* @param input tuple; first column is assumed to have the column to convert
@@ -49,21 +49,21 @@ public class LOWER extends EvalFunc<Stri
if (input == null || input.size() == 0)
return null;
- try{
+ try {
String str = (String)input.get(0);
return str.toLowerCase();
- }catch(Exception e){
- System.err.println("Failed to process input; error - " + e.getMessage());
+ } catch(Exception e){
+ log.warn("Failed to process input; error - " + e.getMessage());
return null;
}
}
- //@Override
/**
* This method gives a name to the column.
* @param input - schema of the input data
* @return schema of the input data
*/
+ @Override
public Schema outputSchema(Schema input) {
return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), DataType.CHARARRAY));
}
Modified: hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/REPLACE.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/REPLACE.java?rev=920710&r1=920709&r2=920710&view=diff
==============================================================================
--- hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/REPLACE.java (original)
+++ hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/REPLACE.java Tue Mar 9 06:28:35 2010
@@ -19,24 +19,25 @@
package org.apache.pig.piggybank.evaluation.string;
import java.io.IOException;
-import java.util.List;
-import java.util.ArrayList;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.DataType;
import org.apache.pig.impl.logicalLayer.schema.Schema;
-import org.apache.pig.impl.logicalLayer.FrontendException;
-import org.apache.pig.FuncSpec;
/**
- * string.REPLACE implements eval function to replace part ofa string.
- * Example:
+ * string.REPLACE implements eval function to replace part of a string.
+ * Example:<code>
* register pigudfs.jar;
* A = load 'mydata' as (name);
* B = foreach A generate string.REPLACE(name, 'blabla', 'bla');
* dump B;
+ * </code>
+ * The first argument is a string on which to perform the operation. The second argument
+ * is treated as a regular expression. The third argument is the replacement string.
+ * This is a wrapper around Java's String.replaceAll(String, String);
+ *
*/
public class REPLACE extends EvalFunc<String>
{
@@ -46,7 +47,7 @@ public class REPLACE extends EvalFunc<St
* @exception java.io.IOException
*/
public String exec(Tuple input) throws IOException {
- if (input == null || input.size() == 0)
+ if (input == null || input.size() < 3)
return null;
try{
@@ -55,21 +56,11 @@ public class REPLACE extends EvalFunc<St
String replacewith = (String)input.get(2);
return source.replaceAll(target, replacewith);
}catch(Exception e){
- System.err.println("Failed to process input; error - " + e.getMessage());
+ log.warn("Failed to process input; error - " + e.getMessage());
return null;
}
}
- //@Override
-// /**
-// * This method gives a name to the column.
-// * @param input - schema of the input data
-// * @return schema of the input data
-// */
-// public Schema outputSchema(Schema input) {
-// return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), DataType.CHARARRAY));
-// }
-
@Override
public Schema outputSchema(Schema input) {
return new Schema(new Schema.FieldSchema(null, DataType.CHARARRAY));
Modified: hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/SUBSTRING.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/SUBSTRING.java?rev=920710&r1=920709&r2=920710&view=diff
==============================================================================
--- hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/SUBSTRING.java (original)
+++ hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/SUBSTRING.java Tue Mar 9 06:28:35 2010
@@ -19,57 +19,54 @@
package org.apache.pig.piggybank.evaluation.string;
import java.io.IOException;
-import java.util.List;
-import java.util.ArrayList;
import org.apache.pig.EvalFunc;
+import org.apache.pig.PigWarning;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.DataType;
import org.apache.pig.impl.logicalLayer.schema.Schema;
-import org.apache.pig.impl.logicalLayer.FrontendException;
-import org.apache.pig.FuncSpec;
+
/**
* string.SUBSTRING implements eval function to get a part of a string.
- * Example:
+ * Example:<code>
* register pigudfs.jar;
* A = load 'mydata' as (name);
* B = foreach A generate string.SUBSTRING(name, 10, 12);
* dump B;
+ * </code>
+ * First argument is the string to take a substring of.<br>
+ * Second argument is the index of the first character of substring.<br>
+ * Third argument is the end index of the substring, exclusive (one past the last character), as in Java's String.substring.<br>
+ * if the last argument is past the end of the string, substring of (beginIndex, length(str)) is returned.
*/
-public class SUBSTRING extends EvalFunc<String>
-{
+public class SUBSTRING extends EvalFunc<String> {
+
/**
* Method invoked on every tuple during foreach evaluation
* @param input tuple; first column is assumed to have the column to convert
* @exception java.io.IOException
*/
public String exec(Tuple input) throws IOException {
- if (input == null || input.size() == 0)
+ if (input == null || input.size() < 3) {
+ log.warn("invalid number of arguments to SUBSTRING");
return null;
-
- try{
+ }
+ try {
String source = (String)input.get(0);
Integer beginindex = (Integer)input.get(1);
Integer endindex = (Integer)input.get(2);
- return source.substring(beginindex, endindex);
- }catch(Exception e){
- System.err.println("Failed to process input; error - " + e.getMessage());
+ return source.substring(beginindex, Math.min(source.length(), endindex));
+ } catch (NullPointerException npe) {
+ log.warn(npe.toString());
+ return null;
+ } catch (ClassCastException e) {
+ log.warn(e.toString());
return null;
}
}
- //@Override
-// /**
-// * This method gives a name to the column.
-// * @param input - schema of the input data
-// * @return schema of the input data
-// */
-// public Schema outputSchema(Schema input) {
-// return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), DataType.CHARARRAY));
-// }
-
@Override
public Schema outputSchema(Schema input) {
return new Schema(new Schema.FieldSchema(null, DataType.CHARARRAY));
Modified: hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/UPPER.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/UPPER.java?rev=920710&r1=920709&r2=920710&view=diff
==============================================================================
--- hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/UPPER.java (original)
+++ hadoop/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/string/UPPER.java Tue Mar 9 06:28:35 2010
@@ -23,6 +23,7 @@ import java.util.List;
import java.util.ArrayList;
import org.apache.pig.EvalFunc;
+import org.apache.pig.PigWarning;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.DataType;
import org.apache.pig.impl.logicalLayer.schema.Schema;
@@ -38,46 +39,48 @@ import org.apache.pig.FuncSpec;
* B = foreach A generate string.UPPER(name);
* dump B;
*/
-public class UPPER extends EvalFunc<String>
-{
+public class UPPER extends EvalFunc<String> {
+
/**
- * Method invoked on every tuple during foreach evaluation
+ * Upper-cases an input string.
* @param input tuple; first column is assumed to have the column to convert
- * @param output - resulting value
+ *
* @exception IOException
*/
public String exec(Tuple input) throws IOException {
if (input == null || input.size() == 0)
return null;
- try{
- String str = (String)input.get(0);
- return str.toUpperCase();
- }catch(Exception e){
- System.err.println("Failed to process input; error - " + e.getMessage());
+ String str = null;
+ try {
+ str = (String)input.get(0);
+ } catch (ClassCastException e) {
+ warn("unable to cast input "+input.get(0)+" of class "+
+ input.get(0).getClass()+" to String", PigWarning.UDF_WARNING_1);
return null;
}
+ return str.toUpperCase();
}
- //@Override
/**
* This method gives a name to the column.
* @param input - schema of the input data
* @return schema of the input data
*/
+ @Override
public Schema outputSchema(Schema input) {
return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), DataType.CHARARRAY));
}
- /* (non-Javadoc)
- * @see org.apache.pig.EvalFunc#getArgToFuncMapping()
- */
- @Override
- public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
+ /* (non-Javadoc)
+ * @see org.apache.pig.EvalFunc#getArgToFuncMapping()
+ */
+ @Override
+ public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
List<FuncSpec> funcList = new ArrayList<FuncSpec>();
funcList.add(new FuncSpec(this.getClass().getName(), new Schema(new Schema.FieldSchema(null, DataType.CHARARRAY))));
return funcList;
- }
+ }
}
Modified: hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/string/TestRegex.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/string/TestRegex.java?rev=920710&r1=920709&r2=920710&view=diff
==============================================================================
--- hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/string/TestRegex.java (original)
+++ hadoop/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/string/TestRegex.java Tue Mar 9 06:28:35 2010
@@ -17,11 +17,14 @@
*/
package org.apache.pig.piggybank.test.evaluation.string;
+import java.io.IOException;
+
import junit.framework.TestCase;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.piggybank.evaluation.string.RegexExtract;
+import org.apache.pig.piggybank.evaluation.string.RegexExtractAll;
import org.apache.pig.piggybank.evaluation.string.RegexMatch;
import org.junit.Test;
@@ -74,4 +77,33 @@ public class TestRegex extends TestCase
r = func.exec(t3);
assertTrue(r==null);
}
+
+ @Test
+ public void testRegexExtractAll() throws IOException {
+ String matchRegex = "^(.+)\\b\\s+is a\\s+\\b(.+)$";
+ TupleFactory tupleFactory = TupleFactory.getInstance();
+ Tuple t1 = tupleFactory.newTuple(2);
+ t1.set(0,"this is a match");
+ t1.set(1, matchRegex);
+
+ Tuple t2 = tupleFactory.newTuple(2);
+ t2.set(0, "no match");
+ t2.set(1, matchRegex);
+
+ Tuple t3 = tupleFactory.newTuple(2);
+ t3.set(0, null);
+ t3.set(1, matchRegex);
+
+ RegexExtractAll func = new RegexExtractAll();
+ Tuple r = func.exec(t1);
+ assertEquals(r.size(), 2);
+ assertEquals("this", r.get(0));
+ assertEquals("match", r.get(1));
+
+ r = func.exec(t2);
+ assertTrue(r==null);
+
+ r = func.exec(t3);
+ assertTrue(r==null);
+ }
}