You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by da...@apache.org on 2017/04/12 17:03:19 UTC
svn commit: r1791153 - in /pig/trunk: CHANGES.txt
src/docs/src/documentation/content/xdocs/func.xml
src/org/apache/pig/builtin/REGEX_SEARCH.java
test/org/apache/pig/test/TestBuiltin.java
Author: daijy
Date: Wed Apr 12 17:03:19 2017
New Revision: 1791153
URL: http://svn.apache.org/viewvc?rev=1791153&view=rev
Log:
PIG-5214: search any substring in the input string
Added:
pig/trunk/src/org/apache/pig/builtin/REGEX_SEARCH.java
Modified:
pig/trunk/CHANGES.txt
pig/trunk/src/docs/src/documentation/content/xdocs/func.xml
pig/trunk/test/org/apache/pig/test/TestBuiltin.java
Modified: pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1791153&r1=1791152&r2=1791153&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Wed Apr 12 17:03:19 2017
@@ -36,6 +36,8 @@ PIG-5067: Revisit union on numeric type
IMPROVEMENTS
+PIG-5214: search any substring in the input string (rainer-46 via daijy)
+
PIG-5210: Option to print MR/Tez plan before launching (ly16 via daijy)
PIG-5175: Upgrade jruby to 1.7.26 (daijy)
Modified: pig/trunk/src/docs/src/documentation/content/xdocs/func.xml
URL: http://svn.apache.org/viewvc/pig/trunk/src/docs/src/documentation/content/xdocs/func.xml?rev=1791153&r1=1791152&r2=1791153&view=diff
==============================================================================
--- pig/trunk/src/docs/src/documentation/content/xdocs/func.xml (original)
+++ pig/trunk/src/docs/src/documentation/content/xdocs/func.xml Wed Apr 12 17:03:19 2017
@@ -4352,7 +4352,74 @@ REGEX_EXTRACT_ALL('192.168.1.5:8020', '(
</section>
-<!-- ======================================================== -->
+<!-- ======================================================== -->
+ <section id="regex-search">
+ <title>REGEX_SEARCH</title>
+ <p>Performs regular expression matching and returns all matches found in a string.</p>
+
+<section>
+ <title>Syntax</title>
+ <table>
+ <tr>
+ <td>
+ <p>REGEX_SEARCH(string, 'regExp');</p>
+ </td>
+ </tr>
+ </table>
+ </section>
+
+<section>
+ <title>Terms</title>
+ <table>
+ <tr>
+ <td>
+ <p>string</p>
+ </td>
+ <td>
+ <p>The string in which to perform the match.</p>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <p>'regExp'</p>
+ </td>
+ <td>
+ <p>The regular expression to which the string is to be matched, in quotes.</p>
+ </td>
+ </tr>
+ </table>
+</section>
+
+<section>
+ <title>Usage</title>
+ <p>
+Use the REGEX_SEARCH function to perform regular expression matching and to find all matched characters in a string.
+ </p>
+ <p>
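+The function returns a bag of tuples. Each tuple contains a single field holding one match: the text captured by the first capturing group of the regular expression. If the string is null or nothing matches, the function returns null.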
+ </p>
+</section>
+
+<section>
+ <title>Example</title>
+ <p>
+This example will return the bag {(=04 ),(=06 ),(=96 )}.
+ </p>
+<source>
+REGEX_SEARCH('a=04 b=06 c=96 or more', '(=\\d+\\s)');
+</source>
+ <p>
+And this example will return the bag {(04),(06),(96)}.
+ </p>
+<source>
+REGEX_SEARCH('a=04 b=06 c=96 or more', '=(\\d+)\\s');
+</source>
+
+ </section>
+</section>
+
+
+<!-- ======================================================== -->
<section id="replace">
<title>REPLACE</title>
<p>Replaces existing characters in a string with new characters.</p>
Added: pig/trunk/src/org/apache/pig/builtin/REGEX_SEARCH.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/builtin/REGEX_SEARCH.java?rev=1791153&view=auto
==============================================================================
--- pig/trunk/src/org/apache/pig/builtin/REGEX_SEARCH.java (added)
+++ pig/trunk/src/org/apache/pig/builtin/REGEX_SEARCH.java Wed Apr 12 17:03:19 2017
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.builtin;
+
+import java.io.IOException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.*;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+import org.apache.pig.impl.util.*;
+
+/**
+ * Search and find all matched characters in a string with a given
+ * regular expression.
+ *
+ * Example:
+ *
+ * a = LOAD 'mydata' AS (name:chararray);
+ * b = FOREACH A GENERATE REGEX_SEARCH(name, 'regEx');
+ *
+ * input tuple: the first field is a string on which performs regular expression matching;
+ * the second field is the regular expression;
+ */
+
+public class REGEX_SEARCH extends EvalFunc<DataBag> {
+ private static BagFactory bagFactory = BagFactory.getInstance();
+ private static TupleFactory tupleFactory = TupleFactory.getInstance();
+
+ public REGEX_SEARCH() {}
+
+ @Override
+ public DataBag exec(Tuple input) throws IOException {
+
+ if (input == null || input.size() < 1) {
+ return null;
+ }
+ if (input.get(0)==null)
+ return null;
+
+ try {
+ if (!input.get(1).equals(mExpression)) {
+ try {
+ mExpression = (String)input.get(1);
+ mPattern = Pattern.compile(mExpression);
+ } catch (Exception e) {
+ String msg = "StringSearchAll : Mal-Formed Regular expression : "+input.get(1);
+ throw new IOException(msg);
+ }
+ }
+ } catch (NullPointerException e) {
+ String msg = "StringSearchAll : Regular expression is null";
+ throw new IOException(msg);
+ }
+ Matcher m = mPattern.matcher((String)input.get(0));
+ if (!m.find()) {
+ return null;
+ }
+
+ Tuple tuple0 = tupleFactory.newTuple(1);
+ tuple0.set(0, m.group(1));
+ DataBag dataBag = bagFactory.newDefaultBag();
+ dataBag.add(tuple0);
+ while (m.find()) {
+ Tuple tuple = tupleFactory.newTuple(1);
+ tuple.set(0, m.group(1));
+ dataBag.add(tuple);
+ }
+ return dataBag;
+ }
+
+ String mExpression = null;
+ Pattern mPattern = null;
+ @Override
+ public Schema outputSchema(Schema input) {
+ try {
+ return new Schema(Utils.getSchemaFromString("{(match:chararray)}"));
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ @Override
+ public boolean allowCompileTimeCalculation() {
+ return true;
+ }
+
+}
Modified: pig/trunk/test/org/apache/pig/test/TestBuiltin.java
URL: http://svn.apache.org/viewvc/pig/trunk/test/org/apache/pig/test/TestBuiltin.java?rev=1791153&r1=1791152&r2=1791153&view=diff
==============================================================================
--- pig/trunk/test/org/apache/pig/test/TestBuiltin.java (original)
+++ pig/trunk/test/org/apache/pig/test/TestBuiltin.java Wed Apr 12 17:03:19 2017
@@ -96,6 +96,7 @@ import org.apache.pig.builtin.ROUND_TO;
import org.apache.pig.builtin.RTRIM;
import org.apache.pig.builtin.SIZE;
import org.apache.pig.builtin.SPRINTF;
+import org.apache.pig.builtin.REGEX_SEARCH;
import org.apache.pig.builtin.STRSPLIT;
import org.apache.pig.builtin.SUBSTRING;
import org.apache.pig.builtin.SecondsBetween;
@@ -1993,6 +1994,32 @@ public class TestBuiltin {
re = funce.exec(te3);
assertTrue(re==null);
+
+ // *** REGEX_SEARCH *** start
+ String matchSearch = "(=\\d+\\s)";
+ tupleFactory = TupleFactory.getInstance();
+ Tuple ts1 = tupleFactory.newTuple(2);
+ ts1.set(0, "a=04 b=06 c=96 or more");
+ ts1.set(1, matchSearch);
+
+ Tuple ts2 = tupleFactory.newTuple(2);
+ ts2.set(0, "a is 04 b is 06");
+ ts2.set(1, matchSearch);
+
+ Tuple ts3 = tupleFactory.newTuple(2);
+ ts3.set(0, null);
+ ts3.set(1, matchSearch);
+
+ REGEX_SEARCH funcs = new REGEX_SEARCH();
+ DataBag reb = funcs.exec(ts1);
+ DataBag b = Util.createBag(new Tuple[]{Util.buildTuple("=04 "), Util.buildTuple("=06 "), Util.buildTuple("=96 ")});
+ assertEquals(b, reb);
+
+ reb = funcs.exec(ts2);
+ assertTrue(reb==null);
+
+ reb = funcs.exec(ts3);
+ assertTrue(reb==null);
}
@Test