You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by da...@apache.org on 2012/03/02 01:07:36 UTC

svn commit: r1296005 - in /pig/trunk: CHANGES.txt src/org/apache/pig/builtin/REGEX_EXTRACT.java src/org/apache/pig/builtin/REGEX_EXTRACT_ALL.java test/org/apache/pig/test/TestBuiltin.java

Author: daijy
Date: Fri Mar  2 00:07:36 2012
New Revision: 1296005

URL: http://svn.apache.org/viewvc?rev=1296005&view=rev
Log:
PIG-2514: REGEX_EXTRACT not returning correct group with non greedy regex

Modified:
    pig/trunk/CHANGES.txt
    pig/trunk/src/org/apache/pig/builtin/REGEX_EXTRACT.java
    pig/trunk/src/org/apache/pig/builtin/REGEX_EXTRACT_ALL.java
    pig/trunk/test/org/apache/pig/test/TestBuiltin.java

Modified: pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1296005&r1=1296004&r2=1296005&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Fri Mar  2 00:07:36 2012
@@ -102,6 +102,8 @@ OPTIMIZATIONS
 
 BUG FIXES
 
+PIG-2514: REGEX_EXTRACT not returning correct group with non greedy regex (romainr via daijy)
+
 PIG-2532: Registered classes fail deserialization in frontend (traviscrawford via julien)
 
 PIG-2549: org.apache.pig.piggybank.storage.avro - Broken documentation link for AvroStorage (chrisas via daijy)

Modified: pig/trunk/src/org/apache/pig/builtin/REGEX_EXTRACT.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/builtin/REGEX_EXTRACT.java?rev=1296005&r1=1296004&r2=1296005&view=diff
==============================================================================
--- pig/trunk/src/org/apache/pig/builtin/REGEX_EXTRACT.java (original)
+++ pig/trunk/src/org/apache/pig/builtin/REGEX_EXTRACT.java Fri Mar  2 00:07:36 2012
@@ -41,12 +41,24 @@ import org.apache.pig.impl.logicalLayer.
 * <dd><code>match_index</code>-<code>index of the group to extract</code>.</dd>
 * <dt><b>Output:</b></dt>
 * <dd><code>extracted group, if fail, return null</code>.</dd>
+* <dt><b>Matching strategy:</b></dt>
+* <dd>By default (useMatches=false) the first matching subsequence is located with {@link Matcher#find()};
+* pass true to the constructor to require the entire input to match via {@link Matcher#matches()}.</dd>
+* <dd><code>DEFINE NON_GREEDY_EXTRACT REGEX_EXTRACT(true);</code></dd>
 * </dl>
 */
 
 public class REGEX_EXTRACT extends EvalFunc<String> {
     String mExpression = null;
-    Pattern mPattern = null; 
+    Pattern mPattern = null;
+    boolean mUseMatches = false;
+
+    public REGEX_EXTRACT() {}
+
+    public REGEX_EXTRACT(boolean useMatches) {
+      this.mUseMatches = useMatches;
+    }
+
     @Override
     public Schema outputSchema(Schema input) {
       try {
@@ -56,6 +68,7 @@ public class REGEX_EXTRACT extends EvalF
       }
     }
 
+    @Override
     public String exec(Tuple input) throws IOException {
         if (input.size()!=3) {
             String msg = "RegexExtract : Only 3 parameters are allowed.";
@@ -81,16 +94,20 @@ public class REGEX_EXTRACT extends EvalF
             throw new IOException(msg);
         }
         int mIndex = (Integer)input.get(2);
-        
+
         Matcher m = mPattern.matcher((String)input.get(0));
-        if (m.find()&&m.groupCount()>=mIndex)
+
+        if (!mUseMatches&&m.find()||mUseMatches&&m.matches())
         {
-            return m.group(mIndex);
+            if (m.groupCount()>=mIndex)
+            {
+                return m.group(mIndex);
+            }
         }
         warn("RegexExtract : Cannot extract group for input "+input.get(0), PigWarning.UDF_WARNING_1);
         return null;
     }
-    
+
     @Override
     public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
         List<FuncSpec> funcList = new ArrayList<FuncSpec>();
@@ -100,5 +117,5 @@ public class REGEX_EXTRACT extends EvalF
         s.add(new Schema.FieldSchema(null, DataType.INTEGER));
         funcList.add(new FuncSpec(this.getClass().getName(), s));
         return funcList;
-    } 
+    }
 }

Modified: pig/trunk/src/org/apache/pig/builtin/REGEX_EXTRACT_ALL.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/builtin/REGEX_EXTRACT_ALL.java?rev=1296005&r1=1296004&r2=1296005&view=diff
==============================================================================
--- pig/trunk/src/org/apache/pig/builtin/REGEX_EXTRACT_ALL.java (original)
+++ pig/trunk/src/org/apache/pig/builtin/REGEX_EXTRACT_ALL.java Fri Mar  2 00:07:36 2012
@@ -40,12 +40,22 @@ import org.apache.pig.impl.logicalLayer.
  * <dd><code>regex</code>-<code>regular expression</code>.</dd>
  * <dt><b>Output:</b></dt>
  * <dd><code>A tuple of matched strings</code>.</dd>
+ * <dt><b>Matching strategy:</b></dt>
+ * <dd>Trying to match the entire input by using {@link Matcher#matches()} instead of
+ * {@link Matcher#find()} (default useMatches=true).</dd>
+ * <dd><code>DEFINE GREEDY_EXTRACT REGEX_EXTRACT_ALL(false);</code></dd>
  * </dl>
  */
 
 public class REGEX_EXTRACT_ALL extends EvalFunc<Tuple> {
-
     private static TupleFactory tupleFactory = TupleFactory.getInstance();
+    boolean mUseMatches = true;
+
+    public REGEX_EXTRACT_ALL() {}
+
+    public REGEX_EXTRACT_ALL(boolean useMatches) {
+      this.mUseMatches = useMatches;
+    }
 
     @Override
     public Tuple exec(Tuple input) throws IOException {
@@ -72,7 +82,7 @@ public class REGEX_EXTRACT_ALL extends E
         }
 
         Matcher m = mPattern.matcher((String)input.get(0));
-        if (!m.matches()) {
+        if (mUseMatches&&!m.matches()||!mUseMatches&&!m.find()) {
             return null;
         }
         Tuple result = tupleFactory.newTuple(m.groupCount());
@@ -83,11 +93,11 @@ public class REGEX_EXTRACT_ALL extends E
     }
 
     String mExpression = null;
-    Pattern mPattern = null; 
+    Pattern mPattern = null;
     @Override
     public Schema outputSchema(Schema input) {
         try {
-            return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), 
+            return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input),
                     DataType.TUPLE));
         } catch (Exception e) {
             return null;
@@ -102,6 +112,6 @@ public class REGEX_EXTRACT_ALL extends E
         s.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
         funcList.add(new FuncSpec(this.getClass().getName(), s));
         return funcList;
-    } 
+    }
 }
 

Modified: pig/trunk/test/org/apache/pig/test/TestBuiltin.java
URL: http://svn.apache.org/viewvc/pig/trunk/test/org/apache/pig/test/TestBuiltin.java?rev=1296005&r1=1296004&r2=1296005&view=diff
==============================================================================
--- pig/trunk/test/org/apache/pig/test/TestBuiltin.java (original)
+++ pig/trunk/test/org/apache/pig/test/TestBuiltin.java Fri Mar  2 00:07:36 2012
@@ -1520,6 +1520,11 @@ public class TestBuiltin {
         t3.set(0, null);
         t3.set(1, "^\\/search\\/iy\\/(.*?)\\/.*");
         t3.set(2, 2);
+        
+        Tuple t4 = tupleFactory.newTuple(3);
+        t4.set(0,"this is a match");
+        t4.set(1, "this is a (.+?)");
+        t4.set(2, 1);
 
         REGEX_EXTRACT func = new REGEX_EXTRACT();
         String r = func.exec(t1);
@@ -1528,6 +1533,12 @@ public class TestBuiltin {
         assertTrue(r==null);
         r = func.exec(t3);
         assertTrue(r==null);
+        r = func.exec(t4);
+        assertEquals("m", r);
+
+        func = new REGEX_EXTRACT(true);
+        r = func.exec(t4);
+        assertEquals("match", r);
 
         String matchRegex = "^(.+)\\b\\s+is a\\s+\\b(.+)$";
         TupleFactory tupleFactory = TupleFactory.getInstance();
@@ -1554,6 +1565,30 @@ public class TestBuiltin {
 
         re = funce.exec(te3);
         assertTrue(re==null);
+
+        matchRegex = "(.+?)(.+?)";
+        tupleFactory = TupleFactory.getInstance();
+        te1 = tupleFactory.newTuple(2);
+        te1.set(0,"this is a match");
+        te1.set(1, matchRegex);
+
+        funce = new REGEX_EXTRACT_ALL();
+        re = funce.exec(te1);
+        assertEquals(re.size(), 2);
+        assertEquals("t", re.get(0));
+        assertEquals("his is a match", re.get(1));
+
+        funce = new REGEX_EXTRACT_ALL(false);
+        re = funce.exec(te1);
+        assertEquals(re.size(), 2);
+        assertEquals("t", re.get(0));
+        assertEquals("h", re.get(1));
+
+        re = funce.exec(te2);
+        assertTrue(re==null);
+
+        re = funce.exec(te3);
+        assertTrue(re==null);
     }
 
     @Test