You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by da...@apache.org on 2012/03/02 01:07:36 UTC
svn commit: r1296005 - in /pig/trunk: CHANGES.txt
src/org/apache/pig/builtin/REGEX_EXTRACT.java
src/org/apache/pig/builtin/REGEX_EXTRACT_ALL.java
test/org/apache/pig/test/TestBuiltin.java
Author: daijy
Date: Fri Mar 2 00:07:36 2012
New Revision: 1296005
URL: http://svn.apache.org/viewvc?rev=1296005&view=rev
Log:
PIG-2514: REGEX_EXTRACT not returning correct group with non greedy regex
Modified:
pig/trunk/CHANGES.txt
pig/trunk/src/org/apache/pig/builtin/REGEX_EXTRACT.java
pig/trunk/src/org/apache/pig/builtin/REGEX_EXTRACT_ALL.java
pig/trunk/test/org/apache/pig/test/TestBuiltin.java
Modified: pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1296005&r1=1296004&r2=1296005&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Fri Mar 2 00:07:36 2012
@@ -102,6 +102,8 @@ OPTIMIZATIONS
BUG FIXES
+PIG-2514: REGEX_EXTRACT not returning correct group with non greedy regex (romainr via daijy)
+
PIG-2532: Registered classes fail deserialization in frontend (traviscrawford via julien)
PIG-2549: org.apache.pig.piggybank.storage.avro - Broken documentation link for AvroStorage (chrisas via daijy)
Modified: pig/trunk/src/org/apache/pig/builtin/REGEX_EXTRACT.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/builtin/REGEX_EXTRACT.java?rev=1296005&r1=1296004&r2=1296005&view=diff
==============================================================================
--- pig/trunk/src/org/apache/pig/builtin/REGEX_EXTRACT.java (original)
+++ pig/trunk/src/org/apache/pig/builtin/REGEX_EXTRACT.java Fri Mar 2 00:07:36 2012
@@ -41,12 +41,24 @@ import org.apache.pig.impl.logicalLayer.
* <dd><code>match_index</code>-<code>index of the group to extract</code>.</dd>
* <dt><b>Output:</b></dt>
* <dd><code>extracted group, if fail, return null</code>.</dd>
+* <dt><b>Matching strategy:</b></dt>
+* <dd>Try to only match the first sequence by using {@link Matcher#find()} instead of
+* {@link Matcher#matches()} (default useMatches=false).</dd>
+* <dd><code>DEFINE NON_GREEDY_EXTRACT REGEX_EXTRACT(true);</code></dd>
* </dl>
*/
public class REGEX_EXTRACT extends EvalFunc<String> {
String mExpression = null;
- Pattern mPattern = null;
+ Pattern mPattern = null;
+ boolean mUseMatches = false;
+
+ public REGEX_EXTRACT() {}
+
+ public REGEX_EXTRACT(boolean useMatches) {
+ this.mUseMatches = useMatches;
+ }
+
@Override
public Schema outputSchema(Schema input) {
try {
@@ -56,6 +68,7 @@ public class REGEX_EXTRACT extends EvalF
}
}
+ @Override
public String exec(Tuple input) throws IOException {
if (input.size()!=3) {
String msg = "RegexExtract : Only 3 parameters are allowed.";
@@ -81,16 +94,20 @@ public class REGEX_EXTRACT extends EvalF
throw new IOException(msg);
}
int mIndex = (Integer)input.get(2);
-
+
Matcher m = mPattern.matcher((String)input.get(0));
- if (m.find()&&m.groupCount()>=mIndex)
+
+ if (!mUseMatches&&m.find()||mUseMatches&&m.matches())
{
- return m.group(mIndex);
+ if (m.groupCount()>=mIndex)
+ {
+ return m.group(mIndex);
+ }
}
warn("RegexExtract : Cannot extract group for input "+input.get(0), PigWarning.UDF_WARNING_1);
return null;
}
-
+
@Override
public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
List<FuncSpec> funcList = new ArrayList<FuncSpec>();
@@ -100,5 +117,5 @@ public class REGEX_EXTRACT extends EvalF
s.add(new Schema.FieldSchema(null, DataType.INTEGER));
funcList.add(new FuncSpec(this.getClass().getName(), s));
return funcList;
- }
+ }
}
Modified: pig/trunk/src/org/apache/pig/builtin/REGEX_EXTRACT_ALL.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/builtin/REGEX_EXTRACT_ALL.java?rev=1296005&r1=1296004&r2=1296005&view=diff
==============================================================================
--- pig/trunk/src/org/apache/pig/builtin/REGEX_EXTRACT_ALL.java (original)
+++ pig/trunk/src/org/apache/pig/builtin/REGEX_EXTRACT_ALL.java Fri Mar 2 00:07:36 2012
@@ -40,12 +40,22 @@ import org.apache.pig.impl.logicalLayer.
* <dd><code>regex</code>-<code>regular expression</code>.</dd>
* <dt><b>Output:</b></dt>
* <dd><code>A tuple of matched strings</code>.</dd>
+ * <dt><b>Matching strategy:</b></dt>
+ * <dd>Trying to match the entire input by using {@link Matcher#matches()} instead of
+ * {@link Matcher#find()} (default useMatches=true).</dd>
+ * <dd><code>DEFINE GREEDY_EXTRACT REGEX_EXTRACT_ALL(false);</code></dd>
* </dl>
*/
public class REGEX_EXTRACT_ALL extends EvalFunc<Tuple> {
-
private static TupleFactory tupleFactory = TupleFactory.getInstance();
+ boolean mUseMatches = true;
+
+ public REGEX_EXTRACT_ALL() {}
+
+ public REGEX_EXTRACT_ALL(boolean useMatches) {
+ this.mUseMatches = useMatches;
+ }
@Override
public Tuple exec(Tuple input) throws IOException {
@@ -72,7 +82,7 @@ public class REGEX_EXTRACT_ALL extends E
}
Matcher m = mPattern.matcher((String)input.get(0));
- if (!m.matches()) {
+ if (mUseMatches&&!m.matches()||!mUseMatches&&!m.find()) {
return null;
}
Tuple result = tupleFactory.newTuple(m.groupCount());
@@ -83,11 +93,11 @@ public class REGEX_EXTRACT_ALL extends E
}
String mExpression = null;
- Pattern mPattern = null;
+ Pattern mPattern = null;
@Override
public Schema outputSchema(Schema input) {
try {
- return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input),
+ return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input),
DataType.TUPLE));
} catch (Exception e) {
return null;
@@ -102,6 +112,6 @@ public class REGEX_EXTRACT_ALL extends E
s.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
funcList.add(new FuncSpec(this.getClass().getName(), s));
return funcList;
- }
+ }
}
Modified: pig/trunk/test/org/apache/pig/test/TestBuiltin.java
URL: http://svn.apache.org/viewvc/pig/trunk/test/org/apache/pig/test/TestBuiltin.java?rev=1296005&r1=1296004&r2=1296005&view=diff
==============================================================================
--- pig/trunk/test/org/apache/pig/test/TestBuiltin.java (original)
+++ pig/trunk/test/org/apache/pig/test/TestBuiltin.java Fri Mar 2 00:07:36 2012
@@ -1520,6 +1520,11 @@ public class TestBuiltin {
t3.set(0, null);
t3.set(1, "^\\/search\\/iy\\/(.*?)\\/.*");
t3.set(2, 2);
+
+ Tuple t4 = tupleFactory.newTuple(3);
+ t4.set(0,"this is a match");
+ t4.set(1, "this is a (.+?)");
+ t4.set(2, 1);
REGEX_EXTRACT func = new REGEX_EXTRACT();
String r = func.exec(t1);
@@ -1528,6 +1533,12 @@ public class TestBuiltin {
assertTrue(r==null);
r = func.exec(t3);
assertTrue(r==null);
+ r = func.exec(t4);
+ assertEquals("m", r);
+
+ func = new REGEX_EXTRACT(true);
+ r = func.exec(t4);
+ assertEquals("match", r);
String matchRegex = "^(.+)\\b\\s+is a\\s+\\b(.+)$";
TupleFactory tupleFactory = TupleFactory.getInstance();
@@ -1554,6 +1565,30 @@ public class TestBuiltin {
re = funce.exec(te3);
assertTrue(re==null);
+
+ matchRegex = "(.+?)(.+?)";
+ tupleFactory = TupleFactory.getInstance();
+ te1 = tupleFactory.newTuple(2);
+ te1.set(0,"this is a match");
+ te1.set(1, matchRegex);
+
+ funce = new REGEX_EXTRACT_ALL();
+ re = funce.exec(te1);
+ assertEquals(re.size(), 2);
+ assertEquals("t", re.get(0));
+ assertEquals("his is a match", re.get(1));
+
+ funce = new REGEX_EXTRACT_ALL(false);
+ re = funce.exec(te1);
+ assertEquals(re.size(), 2);
+ assertEquals("t", re.get(0));
+ assertEquals("h", re.get(1));
+
+ re = funce.exec(te2);
+ assertTrue(re==null);
+
+ re = funce.exec(te3);
+ assertTrue(re==null);
}
@Test