You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@streams.apache.org by mf...@apache.org on 2014/05/14 17:31:20 UTC

[1/8] git commit: Added new test cases

Repository: incubator-streams
Updated Branches:
  refs/heads/master e68300936 -> a1b02094e


Added new test cases


Project: http://git-wip-us.apache.org/repos/asf/incubator-streams/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-streams/commit/4ae10fd3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-streams/tree/4ae10fd3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-streams/diff/4ae10fd3

Branch: refs/heads/master
Commit: 4ae10fd3abdd2694d15fb102286cfc94931eacb2
Parents: 124e01e
Author: mfranklin <mf...@apache.org>
Authored: Tue May 13 12:26:58 2014 -0400
Committer: mfranklin <mf...@apache.org>
Committed: Wed May 14 11:31:09 2014 -0400

----------------------------------------------------------------------
 .../src/main/java/org/apache/streams/regex/RegexUtils.java       | 2 +-
 .../src/test/java/org/apache/streams/regex/RegexUtilsTest.java   | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/4ae10fd3/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java
index bf5c03a..41c3ee5 100644
--- a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java
+++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java
@@ -52,7 +52,7 @@ public class RegexUtils {
      * @return a non-null list of matches.
      */
     public static List<String> extractWordMatches(String pattern, String content) {
-        pattern = "(^|\\s)(" + pattern + ")([\\s!\\.;,]|$)";
+        pattern = "(^|\\s)(" + pattern + ")([\\s!\\.;,?]|$)";
         return getMatches(pattern, content, 2);
     }
 

http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/4ae10fd3/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java
index 7dad4ca..eed1327 100644
--- a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java
+++ b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java
@@ -56,7 +56,9 @@ public class RegexUtilsTest {
                 {"#\\w+", "This is #freakingcrazydude.", 1, 1},
                 {"#\\w+", "This is #freakingcrazydude!", 1, 1},
                 {"#\\w+", "This is #freakingcrazydude I went to the moon", 1, 1},
-                {"#\\w+", "This is #freakingcrazydude I went to the #freakingcrazydude party", 2, 2},
+                {"#\\w+", "This is #freakingcrazydude I went to the #freakingcrazydude? party", 2, 2},
+                {"#\\w+", "This is #freakingcrazydude I went to the #freakingcrazydude; party", 2, 2},
+                {"#\\w+", "This is #freakingcrazydude I went to the #freakingcrazydude, party", 2, 2},
                 {"#\\w+", "This is#freakingcrazydude I went to the #freakingcrazydude party", 2, 1},
                 {"#\\w+", "#what does the fox say?", 1, 1},
                 {"#\\w+", "#what does the fox #say", 2, 2}


[8/8] git commit: added mentions extractor

Posted by mf...@apache.org.
added mentions extractor


Project: http://git-wip-us.apache.org/repos/asf/incubator-streams/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-streams/commit/75578e99
Tree: http://git-wip-us.apache.org/repos/asf/incubator-streams/tree/75578e99
Diff: http://git-wip-us.apache.org/repos/asf/incubator-streams/diff/75578e99

Branch: refs/heads/master
Commit: 75578e994c2bfbeea4e69487138523348e99bedf
Parents: 37d378d
Author: mfranklin <mf...@apache.org>
Authored: Wed May 14 10:23:36 2014 -0400
Committer: mfranklin <mf...@apache.org>
Committed: Wed May 14 11:31:10 2014 -0400

----------------------------------------------------------------------
 .../streams/regex/RegexMentionsExtractor.java   | 48 +++++++++++
 .../regex/RegexMentionExtractorTest.java        | 83 ++++++++++++++++++++
 2 files changed, 131 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/75578e99/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexMentionsExtractor.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexMentionsExtractor.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexMentionsExtractor.java
new file mode 100644
index 0000000..dbf4540
--- /dev/null
+++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexMentionsExtractor.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.streams.regex;
+import com.google.common.collect.Maps;
+import org.apache.streams.core.StreamsProcessor;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Processes the content of an {@link org.apache.streams.pojo.json.Activity} object to extract the @user mentions and add
+ * them to the appropriate extensions object
+ */
+public class RegexMentionsExtractor extends AbstractRegexExtensionExtractor<Map<String, Object>> implements StreamsProcessor {
+    public static final String DEFAULT_PATTERN = "@\\w+";
+    public static final String PATTERN_CONFIG_KEY = "MentionPattern";
+    public static final String EXTENSION_KEY = "user_mentions";
+    public static final String DISPLAY_KEY = "displayName";
+
+    protected RegexMentionsExtractor() {
+        super(PATTERN_CONFIG_KEY, EXTENSION_KEY, DEFAULT_PATTERN);
+    }
+
+    @Override
+    protected Map<String, Object> prepareObject(String extracted) {
+        HashMap<String, Object> mention = Maps.newHashMap();
+        mention.put(DISPLAY_KEY, extracted.substring(1));
+        return mention;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/75578e99/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexMentionExtractorTest.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexMentionExtractorTest.java b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexMentionExtractorTest.java
new file mode 100644
index 0000000..0379c09
--- /dev/null
+++ b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexMentionExtractorTest.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.streams.regex;
+
+
+import com.google.common.collect.Sets;
+import org.apache.streams.core.StreamsDatum;
+import org.apache.streams.pojo.json.Activity;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import java.util.*;
+
+import static org.apache.streams.data.util.ActivityUtil.ensureExtensions;
+import static org.hamcrest.CoreMatchers.equalTo;
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.assertThat;
+
+@RunWith(Parameterized.class)
+public class RegexMentionExtractorTest {
+
+    private Activity activity;
+    private Set<Map<String, Object>> mentions;
+
+    public RegexMentionExtractorTest(String activityContent, Set<Map<String, Object>> hashtags) {
+        this.activity = new Activity();
+        this.activity.setContent(activityContent);
+        this.mentions = hashtags;
+    }
+
+    @Parameterized.Parameters
+    public static Collection<Object[]> params() {
+        return Arrays.asList(new Object[][]{
+                {"This is the @content of a standard tweet", Sets.newHashSet(new HashMap<String, Object>() {{
+                    put("displayName", "content");
+                }})},
+                {"This is the content of a standard tweet", Sets.newHashSet(new HashMap<String, Object>())},
+                {"This is the @content of a standard @tweet",  Sets.newHashSet(new HashMap<String, Object>() {{
+                    put("displayName", "content");
+                }},new HashMap<String, Object>() {{
+                    put("displayName", "tweet");
+                }})},
+                {"UNIX 时间1400000000 秒…… (该睡觉了,各位夜猫子)@程序员#", Sets.newHashSet(new HashMap<String, Object>() {{
+                    put("displayName", "程序员");
+                }})},
+                {"This is the body of a @fbpost. It can have multiple lines of #content, as well as much more detailed and flowery @language.",  Sets.newHashSet(new HashMap<String, Object>() {{
+                    put("displayName", "fbpost");
+                }},new HashMap<String, Object>() {{
+                    put("displayName", "language");
+                }})}
+        });
+    }
+
+    @Test
+    @SuppressWarnings("unchecked")
+    public void testExtraction() {
+        StreamsDatum datum = new StreamsDatum(activity, "Test");
+        List<StreamsDatum> result = new RegexMentionsExtractor().process(datum);
+        assertThat(result.size(), is(equalTo(1)));
+        Activity output = (Activity)result.get(0).getDocument();
+        Set<String> extracted = (Set) ensureExtensions(output).get(RegexMentionsExtractor.EXTENSION_KEY);
+        Sets.SetView<String> diff = Sets.difference(extracted, mentions);
+        assertThat(diff.size(), is(equalTo(0)));
+    }
+}


[7/8] git commit: Added hashtag processor

Posted by mf...@apache.org.
Added hashtag processor


Project: http://git-wip-us.apache.org/repos/asf/incubator-streams/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-streams/commit/204977ec
Tree: http://git-wip-us.apache.org/repos/asf/incubator-streams/tree/204977ec
Diff: http://git-wip-us.apache.org/repos/asf/incubator-streams/diff/204977ec

Branch: refs/heads/master
Commit: 204977ec87d039b12e89e51098d7137f19f5f7ab
Parents: 4ae10fd
Author: mfranklin <mf...@apache.org>
Authored: Tue May 13 16:10:55 2014 -0400
Committer: mfranklin <mf...@apache.org>
Committed: Wed May 14 11:31:10 2014 -0400

----------------------------------------------------------------------
 streams-contrib/streams-processor-regex/pom.xml | 11 ++-
 .../streams/regex/RegexHashtagExtractor.java    | 97 ++++++++++++++++++++
 .../org/apache/streams/regex/RegexUtils.java    | 26 ++++--
 .../regex/RegexHashtagExtractorTest.java        | 72 +++++++++++++++
 .../apache/streams/regex/RegexUtilsTest.java    | 14 +--
 5 files changed, 204 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/204977ec/streams-contrib/streams-processor-regex/pom.xml
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/pom.xml b/streams-contrib/streams-processor-regex/pom.xml
index 094013d..57f661a 100644
--- a/streams-contrib/streams-processor-regex/pom.xml
+++ b/streams-contrib/streams-processor-regex/pom.xml
@@ -30,5 +30,14 @@
 
     <artifactId>streams-processor-regex</artifactId>
 
-
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.streams</groupId>
+            <artifactId>streams-pojo</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.streams</groupId>
+            <artifactId>streams-core</artifactId>
+        </dependency>
+    </dependencies>
 </project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/204977ec/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java
new file mode 100644
index 0000000..fe392b7
--- /dev/null
+++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.streams.regex;
+
+import com.google.common.base.Strings;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+import org.apache.streams.core.StreamsDatum;
+import org.apache.streams.core.StreamsProcessor;
+import org.apache.streams.pojo.json.Activity;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import static org.apache.streams.data.util.ActivityUtil.ensureExtensions;
+
+/**
+ * Processes the content of an {@link org.apache.streams.pojo.json.Activity} object to extract the Hashtags and add
+ * them to the appropriate extensions object
+ */
+public class RegexHashtagExtractor implements StreamsProcessor{
+
+    public final static String DEFAULT_PATTERN = "#\\w+";
+    public final static String PATTERN_CONFIG_KEY = "HashtagPattern";
+    public final static String EXTENSION_KEY = "hashtags";
+
+    private String hashPattern;
+
+    public String getHashPattern() {
+        return hashPattern;
+    }
+
+    @Override
+    public List<StreamsDatum> process(StreamsDatum entry) {
+        if(!(entry.getDocument() instanceof Activity)) {
+            return Lists.newArrayList();
+        }
+        if(Strings.isNullOrEmpty(hashPattern)) {
+            prepare(null);
+        }
+        Activity activity = (Activity)entry.getDocument();
+        Map<String, List<Integer>> matches = RegexUtils.extractMatches(hashPattern, activity.getContent());
+        Set<String> hashtags = ensureHashtagsExtension(activity);
+        for(String key : matches.keySet()) {
+            hashtags.add(key.substring(1));
+        }
+        return Lists.newArrayList(entry);
+    }
+
+    @Override
+    public void prepare(Object configurationObject) {
+        if(configurationObject instanceof Map) {
+            if(((Map)configurationObject).containsKey(PATTERN_CONFIG_KEY)) {
+                hashPattern = (String)((Map)configurationObject).get(PATTERN_CONFIG_KEY);
+            }
+        } else if(configurationObject instanceof String) {
+            hashPattern = (String)configurationObject;
+        } else {
+            hashPattern = DEFAULT_PATTERN;
+        }
+    }
+
+    @Override
+    public void cleanUp() {
+        //NOP
+    }
+
+    protected Set<String> ensureHashtagsExtension(Activity activity) {
+        Map<String, Object> extensions = ensureExtensions(activity);
+        Set<String> hashtags;
+        if(extensions.containsKey(EXTENSION_KEY)) {
+            hashtags = Sets.newHashSet((Iterable<String>) extensions.get(EXTENSION_KEY));
+        } else {
+            hashtags = Sets.newHashSet();
+            extensions.put(EXTENSION_KEY, hashtags);
+        }
+        return hashtags;
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/204977ec/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java
index 41c3ee5..662fc98 100644
--- a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java
+++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java
@@ -19,10 +19,11 @@
 
 package org.apache.streams.regex;
 
-import java.util.LinkedList;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
 import java.util.List;
 import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -31,7 +32,7 @@ import java.util.regex.Pattern;
  */
 public class RegexUtils {
 
-    private static final Map<String, Pattern> patternCache = new ConcurrentHashMap<String, Pattern>();
+    private static final Map<String, Pattern> patternCache = Maps.newConcurrentMap();
 
     private RegexUtils() {}
 
@@ -41,7 +42,7 @@ public class RegexUtils {
      * @param content the complete content to find matches in.
      * @return a non-null list of matches.
      */
-    public static List<String> extractMatches(String pattern, String content) {
+    public static Map<String, List<Integer>> extractMatches(String pattern, String content) {
         return getMatches(pattern, content, -1);
     }
 
@@ -51,21 +52,28 @@ public class RegexUtils {
      * @param content the complete content to find matches in.
      * @return a non-null list of matches.
      */
-    public static List<String> extractWordMatches(String pattern, String content) {
+    public static Map<String, List<Integer>> extractWordMatches(String pattern, String content) {
         pattern = "(^|\\s)(" + pattern + ")([\\s!\\.;,?]|$)";
         return getMatches(pattern, content, 2);
     }
 
-    protected static List<String> getMatches(String pattern, String content, int capture) {
+    protected static Map<String, List<Integer>> getMatches(String pattern, String content, int capture) {
         Matcher m = getPattern(pattern).matcher(content);
-        List<String> result = new LinkedList<String>();
+        Map<String, List<Integer>> matches = Maps.newHashMap();
         while(m.find()) {
             String group = capture > 0 ? m.group(capture) : m.group();
             if(group != null && !group.equals("")) {
-                result.add(group);
+                List<Integer> indices;
+                if(matches.containsKey(group)) {
+                    indices = matches.get(group);
+                } else {
+                    indices = Lists.newArrayList();
+                    matches.put(group, indices);
+                }
+                indices.add(m.start());
             }
         }
-        return result;
+        return matches;
     }
 
     private static Pattern getPattern(String pattern) {

http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/204977ec/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java
new file mode 100644
index 0000000..d2912b7
--- /dev/null
+++ b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.streams.regex;
+
+
+import com.google.common.collect.Sets;
+import org.apache.streams.core.StreamsDatum;
+import org.apache.streams.pojo.json.Activity;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.Set;
+
+import static org.apache.streams.data.util.ActivityUtil.ensureExtensions;
+import static org.hamcrest.CoreMatchers.equalTo;
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.assertThat;
+
+@RunWith(Parameterized.class)
+public class RegexHashtagExtractorTest {
+
+    private Activity activity;
+    private Set<String> hashtags;
+
+    public RegexHashtagExtractorTest(String activityContent, Set<String> hashtags) {
+        this.activity = new Activity();
+        this.activity.setContent(activityContent);
+        this.hashtags = hashtags;
+    }
+
+    @Parameterized.Parameters
+    public static Collection<Object[]> params() {
+        return Arrays.asList(new Object[][]{
+                {"This is the #content of a standard tweet", Sets.newHashSet("content")},
+                {"This is the content of a standard tweet", Sets.newHashSet()},
+                {"This is the #content of a standard #tweet", Sets.newHashSet("content", "tweet")},
+                {"This is the body of a #fbpost.  It can have multiple lines of #content, as well as much more detailed and flowery #language.", Sets.newHashSet("content", "fbpost", "language")}
+        });
+    }
+
+    @Test
+    public void testExtraction() {
+        StreamsDatum datum = new StreamsDatum(activity, "Test");
+        List<StreamsDatum> result = new RegexHashtagExtractor().process(datum);
+        assertThat(result.size(), is(equalTo(1)));
+        Activity output = (Activity)result.get(0).getDocument();
+        Set<String> extracted = (Set) ensureExtensions(output).get(RegexHashtagExtractor.EXTENSION_KEY);
+        Sets.SetView<String> diff = Sets.difference(extracted, hashtags);
+        assertThat(diff.size(), is(equalTo(0)));
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/204977ec/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java
index eed1327..fc2b9f6 100644
--- a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java
+++ b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java
@@ -27,6 +27,7 @@ import org.junit.runners.Parameterized;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.List;
+import java.util.Map;
 
 import static org.hamcrest.CoreMatchers.equalTo;
 import static org.hamcrest.CoreMatchers.is;
@@ -56,10 +57,11 @@ public class RegexUtilsTest {
                 {"#\\w+", "This is #freakingcrazydude.", 1, 1},
                 {"#\\w+", "This is #freakingcrazydude!", 1, 1},
                 {"#\\w+", "This is #freakingcrazydude I went to the moon", 1, 1},
-                {"#\\w+", "This is #freakingcrazydude I went to the #freakingcrazydude? party", 2, 2},
-                {"#\\w+", "This is #freakingcrazydude I went to the #freakingcrazydude; party", 2, 2},
-                {"#\\w+", "This is #freakingcrazydude I went to the #freakingcrazydude, party", 2, 2},
-                {"#\\w+", "This is#freakingcrazydude I went to the #freakingcrazydude party", 2, 1},
+                {"#\\w+", "This is #freakingcrazydude I went to the #crazy? party", 2, 2},
+                {"#\\w+", "This is #freakingcrazydude I went to the #crazy; party", 2, 2},
+                {"#\\w+", "This is #freakingcrazydude I went to the #crazy, party", 2, 2},
+                {"#\\w+", "This is#freakingcrazydude I went to the #crazy party", 2, 1},
+                {"#\\w+", "This is #freakingcrazydude I went to the #freakingcrazydude party", 1, 1},
                 {"#\\w+", "#what does the fox say?", 1, 1},
                 {"#\\w+", "#what does the fox #say", 2, 2}
         });
@@ -68,10 +70,10 @@ public class RegexUtilsTest {
 
     @Test
     public void testMatches_simple() {
-        List<String> wordResults = RegexUtils.extractWordMatches(this.pattern, this.content);
+        Map<String, List<Integer>> wordResults = RegexUtils.extractWordMatches(this.pattern, this.content);
         assertThat(wordResults.size(), is(equalTo(wordMatchCount)));
 
-        List<String> regularResults = RegexUtils.extractMatches(this.pattern, this.content);
+        Map<String, List<Integer>> regularResults = RegexUtils.extractMatches(this.pattern, this.content);
         assertThat(regularResults.size(), is(equalTo(regularMatchCount)));
     }
 


[5/8] git commit: refactored hashtag processor to use new abstraction

Posted by mf...@apache.org.
refactored hashtag processor to use new abstraction


Project: http://git-wip-us.apache.org/repos/asf/incubator-streams/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-streams/commit/37d378de
Tree: http://git-wip-us.apache.org/repos/asf/incubator-streams/tree/37d378de
Diff: http://git-wip-us.apache.org/repos/asf/incubator-streams/diff/37d378de

Branch: refs/heads/master
Commit: 37d378de5d6acca0b6e906cc46322dcdb0ab0f08
Parents: 1bbfaca
Author: mfranklin <mf...@apache.org>
Authored: Wed May 14 10:23:01 2014 -0400
Committer: mfranklin <mf...@apache.org>
Committed: Wed May 14 11:31:10 2014 -0400

----------------------------------------------------------------------
 .../streams/regex/RegexHashtagExtractor.java    | 64 ++------------------
 .../regex/RegexHashtagExtractorTest.java        |  4 +-
 2 files changed, 8 insertions(+), 60 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/37d378de/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java
index fe392b7..1e565c8 100644
--- a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java
+++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java
@@ -19,79 +19,25 @@
 
 package org.apache.streams.regex;
 
-import com.google.common.base.Strings;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Sets;
-import org.apache.streams.core.StreamsDatum;
 import org.apache.streams.core.StreamsProcessor;
-import org.apache.streams.pojo.json.Activity;
-
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import static org.apache.streams.data.util.ActivityUtil.ensureExtensions;
 
 /**
  * Processes the content of an {@link org.apache.streams.pojo.json.Activity} object to extract the Hashtags and add
  * them to the appropriate extensions object
  */
-public class RegexHashtagExtractor implements StreamsProcessor{
+public class RegexHashtagExtractor extends AbstractRegexExtensionExtractor<String> implements StreamsProcessor{
 
     public final static String DEFAULT_PATTERN = "#\\w+";
     public final static String PATTERN_CONFIG_KEY = "HashtagPattern";
     public final static String EXTENSION_KEY = "hashtags";
 
-    private String hashPattern;
-
-    public String getHashPattern() {
-        return hashPattern;
-    }
-
-    @Override
-    public List<StreamsDatum> process(StreamsDatum entry) {
-        if(!(entry.getDocument() instanceof Activity)) {
-            return Lists.newArrayList();
-        }
-        if(Strings.isNullOrEmpty(hashPattern)) {
-            prepare(null);
-        }
-        Activity activity = (Activity)entry.getDocument();
-        Map<String, List<Integer>> matches = RegexUtils.extractMatches(hashPattern, activity.getContent());
-        Set<String> hashtags = ensureHashtagsExtension(activity);
-        for(String key : matches.keySet()) {
-            hashtags.add(key.substring(1));
-        }
-        return Lists.newArrayList(entry);
+    public RegexHashtagExtractor() {
+        super(PATTERN_CONFIG_KEY, EXTENSION_KEY, DEFAULT_PATTERN);
     }
 
-    @Override
-    public void prepare(Object configurationObject) {
-        if(configurationObject instanceof Map) {
-            if(((Map)configurationObject).containsKey(PATTERN_CONFIG_KEY)) {
-                hashPattern = (String)((Map)configurationObject).get(PATTERN_CONFIG_KEY);
-            }
-        } else if(configurationObject instanceof String) {
-            hashPattern = (String)configurationObject;
-        } else {
-            hashPattern = DEFAULT_PATTERN;
-        }
-    }
 
     @Override
-    public void cleanUp() {
-        //NOP
-    }
-
-    protected Set<String> ensureHashtagsExtension(Activity activity) {
-        Map<String, Object> extensions = ensureExtensions(activity);
-        Set<String> hashtags;
-        if(extensions.containsKey(EXTENSION_KEY)) {
-            hashtags = Sets.newHashSet((Iterable<String>) extensions.get(EXTENSION_KEY));
-        } else {
-            hashtags = Sets.newHashSet();
-            extensions.put(EXTENSION_KEY, hashtags);
-        }
-        return hashtags;
+    protected String prepareObject(String extracted) {
+        return extracted.substring(1);
     }
 }

http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/37d378de/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java
index d2912b7..55e007e 100644
--- a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java
+++ b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java
@@ -55,11 +55,13 @@ public class RegexHashtagExtractorTest {
                 {"This is the #content of a standard tweet", Sets.newHashSet("content")},
                 {"This is the content of a standard tweet", Sets.newHashSet()},
                 {"This is the #content of a standard #tweet", Sets.newHashSet("content", "tweet")},
-                {"This is the body of a #fbpost.  It can have multiple lines of #content, as well as much more detailed and flowery #language.", Sets.newHashSet("content", "fbpost", "language")}
+                {"UNIX 时间1400000000 秒…… (该睡觉了,各位夜猫子)#程序员#", Sets.newHashSet("程序员")},
+                {"This is the body of a #fbpost. It can have multiple lines of #content, as well as much more detailed and flowery #language.", Sets.newHashSet("content", "fbpost", "language")}
         });
     }
 
     @Test
+    @SuppressWarnings("unchecked")
     public void testExtraction() {
         StreamsDatum datum = new StreamsDatum(activity, "Test");
         List<StreamsDatum> result = new RegexHashtagExtractor().process(datum);


[2/8] git commit: Added Regex Utility to aid in parsing of content

Posted by mf...@apache.org.
Added Regex Utility to aid in parsing of content


Project: http://git-wip-us.apache.org/repos/asf/incubator-streams/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-streams/commit/124e01e9
Tree: http://git-wip-us.apache.org/repos/asf/incubator-streams/tree/124e01e9
Diff: http://git-wip-us.apache.org/repos/asf/incubator-streams/diff/124e01e9

Branch: refs/heads/master
Commit: 124e01e9c3b89c500a877f31fc90ff168f7b52fa
Parents: 65822f2
Author: mfranklin <mf...@apache.org>
Authored: Tue May 13 12:23:38 2014 -0400
Committer: mfranklin <mf...@apache.org>
Committed: Wed May 14 11:31:09 2014 -0400

----------------------------------------------------------------------
 .../org/apache/streams/regex/RegexUtils.java    | 83 ++++++++++++++++++++
 .../apache/streams/regex/RegexUtilsTest.java    | 76 ++++++++++++++++++
 2 files changed, 159 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/124e01e9/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java
new file mode 100644
index 0000000..bf5c03a
--- /dev/null
+++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.streams.regex;
+
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Provides utilities for extracting matches from content
+ */
+public class RegexUtils {
+
+    private static final Map<String, Pattern> patternCache = new ConcurrentHashMap<String, Pattern>();
+
+    private RegexUtils() {}
+
+    /**
+     * Extracts matches of the given pattern in the content and returns them as a list.
+     * @param pattern the pattern for the substring to match.  For example, [0-9]* matches 911 in Emergency number is 911.
+     * @param content the complete content to find matches in.
+     * @return a non-null list of matches.
+     */
+    public static List<String> extractMatches(String pattern, String content) {
+        return getMatches(pattern, content, -1);
+    }
+
+    /**
+     * Extracts matches of the given pattern that are bounded by separation characters and returns them as a list.
+     * @param pattern the pattern for the substring to match.  For example, [0-9]* matches 911 in Emergency number is 911.
+     * @param content the complete content to find matches in.
+     * @return a non-null list of matches.
+     */
+    public static List<String> extractWordMatches(String pattern, String content) {
+        pattern = "(^|\\s)(" + pattern + ")([\\s!\\.;,]|$)";
+        return getMatches(pattern, content, 2);
+    }
+
+    protected static List<String> getMatches(String pattern, String content, int capture) {
+        Matcher m = getPattern(pattern).matcher(content);
+        List<String> result = new LinkedList<String>();
+        while(m.find()) {
+            String group = capture > 0 ? m.group(capture) : m.group();
+            if(group != null && !group.equals("")) {
+                result.add(group);
+            }
+        }
+        return result;
+    }
+
+    private static Pattern getPattern(String pattern) {
+        Pattern p;
+        if (patternCache.containsKey(pattern)) {
+            p = patternCache.get(pattern);
+        } else {
+            p = Pattern.compile(pattern);
+            patternCache.put(pattern, p);
+        }
+        return p;
+    }
+
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/124e01e9/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java
new file mode 100644
index 0000000..7dad4ca
--- /dev/null
+++ b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.streams.regex;
+
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+
+import static org.hamcrest.CoreMatchers.equalTo;
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.assertThat;
+
+
+@RunWith(Parameterized.class)
+public class RegexUtilsTest {
+
+    private final String pattern;
+    private final String content;
+    private final int wordMatchCount;
+    private final int regularMatchCount;
+
+    public RegexUtilsTest(String pattern, String content, int regularMatchCount, int wordMatchCount) {
+        this.pattern = pattern;
+        this.content = content;
+        this.wordMatchCount = wordMatchCount;
+        this.regularMatchCount = regularMatchCount;
+    }
+
+    @Parameterized.Parameters
+    public static Collection<Object[]> parameters() {
+        return Arrays.asList(new Object[][]{
+                {"[0-9]*", "The number for emergencies is 911.", 1, 1},
+                {"#\\w+", "This is#freakingcrazydude.", 1, 0},
+                {"#\\w+", "This is #freakingcrazydude.", 1, 1},
+                {"#\\w+", "This is #freakingcrazydude!", 1, 1},
+                {"#\\w+", "This is #freakingcrazydude I went to the moon", 1, 1},
+                {"#\\w+", "This is #freakingcrazydude I went to the #freakingcrazydude party", 2, 2},
+                {"#\\w+", "This is#freakingcrazydude I went to the #freakingcrazydude party", 2, 1},
+                {"#\\w+", "#what does the fox say?", 1, 1},
+                {"#\\w+", "#what does the fox #say", 2, 2}
+        });
+    }
+
+
+    @Test
+    public void testMatches_simple() {
+        List<String> wordResults = RegexUtils.extractWordMatches(this.pattern, this.content);
+        assertThat(wordResults.size(), is(equalTo(wordMatchCount)));
+
+        List<String> regularResults = RegexUtils.extractMatches(this.pattern, this.content);
+        assertThat(regularResults.size(), is(equalTo(regularMatchCount)));
+    }
+
+}


[4/8] git commit: Added URL extractor

Posted by mf...@apache.org.
Added URL extractor


Project: http://git-wip-us.apache.org/repos/asf/incubator-streams/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-streams/commit/a1b02094
Tree: http://git-wip-us.apache.org/repos/asf/incubator-streams/tree/a1b02094
Diff: http://git-wip-us.apache.org/repos/asf/incubator-streams/diff/a1b02094

Branch: refs/heads/master
Commit: a1b02094ebd61233888635d6bcbe0ce383a6c009
Parents: 75578e9
Author: mfranklin <mf...@apache.org>
Authored: Wed May 14 11:18:06 2014 -0400
Committer: mfranklin <mf...@apache.org>
Committed: Wed May 14 11:31:10 2014 -0400

----------------------------------------------------------------------
 .../regex/AbstractRegexExtensionExtractor.java  |  5 +-
 .../apache/streams/regex/RegexUrlExtractor.java | 68 +++++++++++++++++++
 .../streams/regex/RegexUrlExtractorTest.java    | 70 ++++++++++++++++++++
 3 files changed, 141 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/a1b02094/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/AbstractRegexExtensionExtractor.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/AbstractRegexExtensionExtractor.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/AbstractRegexExtensionExtractor.java
index 6774962..23d1ad5 100644
--- a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/AbstractRegexExtensionExtractor.java
+++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/AbstractRegexExtensionExtractor.java
@@ -26,6 +26,7 @@ import org.apache.streams.core.StreamsDatum;
 import org.apache.streams.core.StreamsProcessor;
 import org.apache.streams.pojo.json.Activity;
 
+import java.util.Collection;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -63,7 +64,7 @@ public abstract class AbstractRegexExtensionExtractor<T> implements StreamsProce
         }
         Activity activity = (Activity)entry.getDocument();
         Map<String, List<Integer>> matches = RegexUtils.extractMatches(pattern, activity.getContent());
-        Set<T> entities = ensureMentionExtension(activity);
+        Collection<T> entities = ensureTargetObject(activity);
         for(String key : matches.keySet()) {
             entities.add(prepareObject(key));
         }
@@ -96,7 +97,7 @@ public abstract class AbstractRegexExtensionExtractor<T> implements StreamsProce
     protected abstract T prepareObject(String extracted);
 
     @SuppressWarnings("unchecked")
-    protected Set<T> ensureMentionExtension(Activity activity) {
+    protected Collection<T> ensureTargetObject(Activity activity) {
         Map<String, Object> extensions = ensureExtensions(activity);
         Set<T> hashtags;
         if(extensions.containsKey(extensionKey)) {

http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/a1b02094/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUrlExtractor.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUrlExtractor.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUrlExtractor.java
new file mode 100644
index 0000000..5d37b3a
--- /dev/null
+++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUrlExtractor.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.streams.regex;
+
+import org.apache.streams.core.StreamsProcessor;
+import org.apache.streams.pojo.json.Activity;
+
+import java.util.Collection;
+
+/**
+ * Processes the content of an {@link org.apache.streams.pojo.json.Activity} object to extract the URLs and add
+ * them to the appropriate extensions object
+ */
+public class RegexUrlExtractor extends AbstractRegexExtensionExtractor<String> implements StreamsProcessor {
+
+    //Temporarily copied from streams-processor-urls so as not to force a dependency on that provider.  This should
+    //be moved to a common utility package
+    public final static String DEFAULT_PATTERN =
+            "(?:(?:https?|ftp)://)" +
+                    "(?:\\S+(?::\\S*)?@)?" +
+                    "(?:" +
+                    "(?!(?:10|127)(?:\\.\\d{1,3}){3})" +
+                    "(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})" +
+                    "(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})" +
+                    "(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])" +
+                    "(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}" +
+                    "(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))" +
+                    "|" +
+                    "(?:(?:[a-z\\u00a1-\\uffff0-9]+-?)*[a-z\\u00a1-\\uffff0-9]+)" +
+                    "(?:\\.(?:[a-z\\u00a1-\\uffff0-9]+-?)*[a-z\\u00a1-\\uffff0-9]+)*" +
+                    "(?:\\.(?:[a-z\\u00a1-\\uffff]{2,}))" +
+                    ")" +
+                    "(?::\\d{2,5})?" +
+                    "(?:/[^\\s]*)?";
+
+    public final static String PATTERN_CONFIG_KEY = "URLPattern";
+
+    public RegexUrlExtractor() {
+        super(PATTERN_CONFIG_KEY, null, DEFAULT_PATTERN);
+    }
+
+    @Override
+    protected String prepareObject(String extracted) {
+        return extracted;
+    }
+
+    @Override
+    protected Collection<String> ensureTargetObject(Activity activity) {
+        return activity.getLinks();
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/a1b02094/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUrlExtractorTest.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUrlExtractorTest.java b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUrlExtractorTest.java
new file mode 100644
index 0000000..38b8dab
--- /dev/null
+++ b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUrlExtractorTest.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.streams.regex;
+
+
+import com.google.common.collect.Sets;
+import org.apache.streams.core.StreamsDatum;
+import org.apache.streams.pojo.json.Activity;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import java.util.*;
+
+import static org.hamcrest.CoreMatchers.equalTo;
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.assertThat;
+
+@RunWith(Parameterized.class)
+public class RegexUrlExtractorTest {
+
+    private Activity activity;
+    private Set<String> links;
+
+    public RegexUrlExtractorTest(String activityContent, Set<String> links) {
+        this.activity = new Activity();
+        this.activity.setContent(activityContent);
+        this.links = links;
+    }
+
+    @Parameterized.Parameters
+    public static Collection<Object[]> params() {
+        return Arrays.asList(new Object[][]{
+                {"This is the http://t.co/foo of a standard tweet", Sets.newHashSet("http://t.co/foo")},
+                {"This is the https://t.co/foo of a standard tweet", Sets.newHashSet("https://t.co/foo")},
+                {"This is the http://amd.com/test of a standard tweet", Sets.newHashSet("http://amd.com/test")},
+                {"This is the content of a standard tweet", Sets.newHashSet()},
+                {"This is the http://www.google.com/articles/awesome?with=query&params=true of a standard @tweet",  Sets.newHashSet("http://www.google.com/articles/awesome?with=query&params=true")}
+        });
+    }
+
+    @Test
+    @SuppressWarnings("unchecked")
+    public void testExtraction() {
+        StreamsDatum datum = new StreamsDatum(activity, "Test");
+        List<StreamsDatum> result = new RegexUrlExtractor().process(datum);
+        assertThat(result.size(), is(equalTo(1)));
+        Activity output = (Activity)result.get(0).getDocument();
+        Set<String> extracted = Sets.newHashSet(output.getLinks());
+        Sets.SetView<String> diff = Sets.difference(links, extracted);
+        assertThat(diff.size(), is(equalTo(0)));
+    }
+}


[3/8] git commit: Added new module for regex processor

Posted by mf...@apache.org.
Added new module for regex processor


Project: http://git-wip-us.apache.org/repos/asf/incubator-streams/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-streams/commit/65822f28
Tree: http://git-wip-us.apache.org/repos/asf/incubator-streams/tree/65822f28
Diff: http://git-wip-us.apache.org/repos/asf/incubator-streams/diff/65822f28

Branch: refs/heads/master
Commit: 65822f280d82afa0e17f543343e867cdfc48a993
Parents: e683009
Author: mfranklin <mf...@apache.org>
Authored: Tue May 13 12:23:13 2014 -0400
Committer: mfranklin <mf...@apache.org>
Committed: Wed May 14 11:31:09 2014 -0400

----------------------------------------------------------------------
 streams-contrib/pom.xml                         |  1 +
 streams-contrib/streams-processor-regex/pom.xml | 34 ++++++++++++++++++++
 2 files changed, 35 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/65822f28/streams-contrib/pom.xml
----------------------------------------------------------------------
diff --git a/streams-contrib/pom.xml b/streams-contrib/pom.xml
index c7bbdf4..44e97c0 100644
--- a/streams-contrib/pom.xml
+++ b/streams-contrib/pom.xml
@@ -56,6 +56,7 @@
         <module>streams-provider-twitter</module>
         <module>streams-provider-sysomos</module>
         <module>streams-provider-rss</module>
+        <module>streams-processor-regex</module>
     </modules>
 
     <dependencyManagement>

http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/65822f28/streams-contrib/streams-processor-regex/pom.xml
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/pom.xml b/streams-contrib/streams-processor-regex/pom.xml
new file mode 100644
index 0000000..094013d
--- /dev/null
+++ b/streams-contrib/streams-processor-regex/pom.xml
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one
+  ~ or more contributor license agreements.  See the NOTICE file
+  ~ distributed with this work for additional information
+  ~ regarding copyright ownership.  The ASF licenses this file
+  ~ to you under the Apache License, Version 2.0 (the
+  ~ "License"); you may not use this file except in compliance
+  ~ with the License.  You may obtain a copy of the License at
+  ~
+  ~   http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing,
+  ~ software distributed under the License is distributed on an
+  ~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  ~ KIND, either express or implied.  See the License for the
+  ~ specific language governing permissions and limitations
+  ~ under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>streams-contrib</artifactId>
+        <groupId>org.apache.streams</groupId>
+        <version>0.1-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <artifactId>streams-processor-regex</artifactId>
+
+
+</project>
\ No newline at end of file


[6/8] git commit: abstracted functionality

Posted by mf...@apache.org.
abstracted functionality


Project: http://git-wip-us.apache.org/repos/asf/incubator-streams/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-streams/commit/1bbfaca1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-streams/tree/1bbfaca1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-streams/diff/1bbfaca1

Branch: refs/heads/master
Commit: 1bbfaca1508031145cead0f7a142cfc1ef8e4522
Parents: 204977e
Author: mfranklin <mf...@apache.org>
Authored: Wed May 14 10:22:10 2014 -0400
Committer: mfranklin <mf...@apache.org>
Committed: Wed May 14 11:31:10 2014 -0400

----------------------------------------------------------------------
 .../regex/AbstractRegexExtensionExtractor.java  | 110 +++++++++++++++++++
 1 file changed, 110 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/1bbfaca1/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/AbstractRegexExtensionExtractor.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/AbstractRegexExtensionExtractor.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/AbstractRegexExtensionExtractor.java
new file mode 100644
index 0000000..6774962
--- /dev/null
+++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/AbstractRegexExtensionExtractor.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.streams.regex;
+
+import com.google.common.base.Strings;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+import org.apache.streams.core.StreamsDatum;
+import org.apache.streams.core.StreamsProcessor;
+import org.apache.streams.pojo.json.Activity;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import static org.apache.streams.data.util.ActivityUtil.ensureExtensions;
+
+/**
+ * Provides a base implementation for extracting entities from text using regular expressions and then
+ * modifying the appropriate {@link org.apache.streams.pojo.json.Activity} extensions object.
+ */
+public abstract class AbstractRegexExtensionExtractor<T> implements StreamsProcessor {
+    private final String patternConfigKey;
+    private final String extensionKey;
+    private final String defaultPattern;
+
+    private String pattern;
+
+    protected AbstractRegexExtensionExtractor(String patternConfigKey, String extensionKey, String defaultPattern) {
+        this.patternConfigKey = patternConfigKey;
+        this.extensionKey = extensionKey;
+        this.defaultPattern = defaultPattern;
+    }
+
+    public String getPattern() {
+        return pattern;
+    }
+
+    @Override
+    public List<StreamsDatum> process(StreamsDatum entry) {
+        if(!(entry.getDocument() instanceof Activity)) {
+            return Lists.newArrayList();
+        }
+        if(Strings.isNullOrEmpty(pattern)) {
+            prepare(null);
+        }
+        Activity activity = (Activity)entry.getDocument();
+        Map<String, List<Integer>> matches = RegexUtils.extractMatches(pattern, activity.getContent());
+        Set<T> entities = ensureMentionExtension(activity);
+        for(String key : matches.keySet()) {
+            entities.add(prepareObject(key));
+        }
+        return Lists.newArrayList(entry);
+    }
+
+    @Override
+    public void prepare(Object configurationObject) {
+        if(configurationObject instanceof Map) {
+            if(((Map)configurationObject).containsKey(patternConfigKey)) {
+                pattern = (String)((Map)configurationObject).get(patternConfigKey);
+            }
+        } else if(configurationObject instanceof String) {
+            pattern = (String)configurationObject;
+        } else {
+            pattern = defaultPattern;
+        }
+    }
+
+    @Override
+    public void cleanUp() {
+        //NOP
+    }
+
+    /**
+     * Configures the value to be persisted to the extensions object
+     * @param extracted the value extracted by the regex
+     * @return an object representing the appropriate extension
+     */
+    protected abstract T prepareObject(String extracted);
+
+    @SuppressWarnings("unchecked")
+    protected Set<T> ensureMentionExtension(Activity activity) {
+        Map<String, Object> extensions = ensureExtensions(activity);
+        Set<T> hashtags;
+        if(extensions.containsKey(extensionKey)) {
+            hashtags = Sets.newHashSet((Iterable<T>) extensions.get(extensionKey));
+        } else {
+            hashtags = Sets.newHashSet();
+            extensions.put(extensionKey, hashtags);
+        }
+        return hashtags;
+    }
+}