You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@streams.apache.org by mf...@apache.org on 2014/05/14 17:31:26 UTC
[7/8] git commit: Added hashtag processor
Added hashtag processor
Project: http://git-wip-us.apache.org/repos/asf/incubator-streams/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-streams/commit/204977ec
Tree: http://git-wip-us.apache.org/repos/asf/incubator-streams/tree/204977ec
Diff: http://git-wip-us.apache.org/repos/asf/incubator-streams/diff/204977ec
Branch: refs/heads/master
Commit: 204977ec87d039b12e89e51098d7137f19f5f7ab
Parents: 4ae10fd
Author: mfranklin <mf...@apache.org>
Authored: Tue May 13 16:10:55 2014 -0400
Committer: mfranklin <mf...@apache.org>
Committed: Wed May 14 11:31:10 2014 -0400
----------------------------------------------------------------------
streams-contrib/streams-processor-regex/pom.xml | 11 ++-
.../streams/regex/RegexHashtagExtractor.java | 97 ++++++++++++++++++++
.../org/apache/streams/regex/RegexUtils.java | 26 ++++--
.../regex/RegexHashtagExtractorTest.java | 72 +++++++++++++++
.../apache/streams/regex/RegexUtilsTest.java | 14 +--
5 files changed, 204 insertions(+), 16 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/204977ec/streams-contrib/streams-processor-regex/pom.xml
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/pom.xml b/streams-contrib/streams-processor-regex/pom.xml
index 094013d..57f661a 100644
--- a/streams-contrib/streams-processor-regex/pom.xml
+++ b/streams-contrib/streams-processor-regex/pom.xml
@@ -30,5 +30,14 @@
<artifactId>streams-processor-regex</artifactId>
-
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.streams</groupId>
+ <artifactId>streams-pojo</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.streams</groupId>
+ <artifactId>streams-core</artifactId>
+ </dependency>
+ </dependencies>
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/204977ec/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java
new file mode 100644
index 0000000..fe392b7
--- /dev/null
+++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.streams.regex;
+
+import com.google.common.base.Strings;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+import org.apache.streams.core.StreamsDatum;
+import org.apache.streams.core.StreamsProcessor;
+import org.apache.streams.pojo.json.Activity;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import static org.apache.streams.data.util.ActivityUtil.ensureExtensions;
+
+/**
+ * Processes the content of an {@link org.apache.streams.pojo.json.Activity} object to extract the hashtags and add
+ * them to the {@link #EXTENSION_KEY} entry of the activity's extensions object.
+ *
+ * <p>The pattern defaults to {@link #DEFAULT_PATTERN} and can be replaced through {@link #prepare(Object)}.</p>
+ */
+public class RegexHashtagExtractor implements StreamsProcessor{
+
+    /** Pattern used when no other pattern has been configured. */
+    public final static String DEFAULT_PATTERN = "#\\w+";
+    /** Key looked up when {@link #prepare(Object)} is given a {@link Map}. */
+    public final static String PATTERN_CONFIG_KEY = "HashtagPattern";
+    /** Key under which the extracted hashtags are stored in the extensions map. */
+    public final static String EXTENSION_KEY = "hashtags";
+
+    private String hashPattern;
+
+    /**
+     * @return the pattern currently in use; may be {@code null} if no pattern has been assigned yet
+     */
+    public String getHashPattern() {
+        return hashPattern;
+    }
+
+    /**
+     * Extracts the hashtags from the content of the activity carried by {@code entry} and merges them into the
+     * activity's hashtag extension.
+     *
+     * @param entry the datum to process
+     * @return a list holding {@code entry} when its document is an {@link Activity}; an empty list otherwise
+     */
+    @Override
+    public List<StreamsDatum> process(StreamsDatum entry) {
+        if(!(entry.getDocument() instanceof Activity)) {
+            return Lists.newArrayList();
+        }
+        if(Strings.isNullOrEmpty(hashPattern)) {
+            // Not configured (or configured with nothing usable): fall back to the default pattern.
+            prepare(null);
+        }
+        Activity activity = (Activity)entry.getDocument();
+        // An activity without content simply yields no matches instead of failing inside the regex matcher.
+        String content = Strings.nullToEmpty(activity.getContent());
+        Map<String, List<Integer>> matches = RegexUtils.extractMatches(hashPattern, content);
+        Set<String> hashtags = ensureHashtagsExtension(activity);
+        for(String key : matches.keySet()) {
+            // Drop the first character of the match, which is the '#' marker for the default pattern.
+            hashtags.add(key.substring(1));
+        }
+        return Lists.newArrayList(entry);
+    }
+
+    /**
+     * Configures the pattern used for extraction.
+     *
+     * @param configurationObject a {@link Map} whose {@link #PATTERN_CONFIG_KEY} entry holds the pattern (a map
+     *                            without that key leaves the current pattern untouched), a {@link String} holding
+     *                            the pattern itself, or any other value (including {@code null}) to select
+     *                            {@link #DEFAULT_PATTERN}
+     */
+    @Override
+    public void prepare(Object configurationObject) {
+        if(configurationObject instanceof Map) {
+            Map<?, ?> configuration = (Map<?, ?>)configurationObject;
+            if(configuration.containsKey(PATTERN_CONFIG_KEY)) {
+                hashPattern = (String)configuration.get(PATTERN_CONFIG_KEY);
+            }
+        } else if(configurationObject instanceof String) {
+            hashPattern = (String)configurationObject;
+        } else {
+            hashPattern = DEFAULT_PATTERN;
+        }
+    }
+
+    @Override
+    public void cleanUp() {
+        //NOP
+    }
+
+    /**
+     * Returns the mutable hashtag set registered in the activity's extensions, creating it when it is missing.
+     * Hashtags already present in the extension are carried over into the returned set.
+     *
+     * @param activity the activity whose extensions are inspected
+     * @return the set stored under {@link #EXTENSION_KEY}; additions to it are visible through the extensions map
+     */
+    protected Set<String> ensureHashtagsExtension(Activity activity) {
+        Map<String, Object> extensions = ensureExtensions(activity);
+        Object existing = extensions.get(EXTENSION_KEY);
+        Set<String> hashtags;
+        if(existing == null) {
+            hashtags = Sets.newHashSet();
+        } else {
+            // Assumes a previously stored extension is a collection of strings.
+            @SuppressWarnings("unchecked")
+            Iterable<String> previous = (Iterable<String>) existing;
+            hashtags = Sets.newHashSet(previous);
+        }
+        // Always store the set that is handed back; otherwise tags added to a copy of an existing extension are lost.
+        extensions.put(EXTENSION_KEY, hashtags);
+        return hashtags;
+    }
+}
http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/204977ec/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java
index 41c3ee5..662fc98 100644
--- a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java
+++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java
@@ -19,10 +19,11 @@
package org.apache.streams.regex;
-import java.util.LinkedList;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
import java.util.List;
import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -31,7 +32,7 @@ import java.util.regex.Pattern;
*/
public class RegexUtils {
- private static final Map<String, Pattern> patternCache = new ConcurrentHashMap<String, Pattern>();
+ private static final Map<String, Pattern> patternCache = Maps.newConcurrentMap();
private RegexUtils() {}
@@ -41,7 +42,7 @@ public class RegexUtils {
* @param content the complete content to find matches in.
* @return a non-null list of matches.
*/
- public static List<String> extractMatches(String pattern, String content) {
+ public static Map<String, List<Integer>> extractMatches(String pattern, String content) { // whole-match mode (capture -1): keys are the matched text, values the start offsets of each occurrence
return getMatches(pattern, content, -1);
}
@@ -51,21 +52,28 @@ public class RegexUtils {
* @param content the complete content to find matches in.
* @return a non-null list of matches.
*/
- public static List<String> extractWordMatches(String pattern, String content) {
+ public static Map<String, List<Integer>> extractWordMatches(String pattern, String content) { // keys come from capture group 2, i.e. the caller's pattern without the surrounding delimiters
pattern = "(^|\\s)(" + pattern + ")([\\s!\\.;,?]|$)";
return getMatches(pattern, content, 2);
}
+ /**
+ * Collects every non-empty match of the pattern in the content, grouped by the matched text.
+ * @param pattern the regular expression to apply.
+ * @param content the complete content to find matches in.
+ * @param capture the capture group to report; zero or a negative value reports the whole match.
+ * @return a non-null map from each distinct matched string to the offsets in content at which it starts.
+ */
- protected static List<String> getMatches(String pattern, String content, int capture) {
+ protected static Map<String, List<Integer>> getMatches(String pattern, String content, int capture) {
Matcher m = getPattern(pattern).matcher(content);
- List<String> result = new LinkedList<String>();
+ Map<String, List<Integer>> matches = Maps.newHashMap();
while(m.find()) {
String group = capture > 0 ? m.group(capture) : m.group();
if(group != null && !group.equals("")) {
- result.add(group);
+ List<Integer> indices;
+ if(matches.containsKey(group)) {
+ indices = matches.get(group);
+ } else {
+ indices = Lists.newArrayList();
+ matches.put(group, indices);
+ }
+ // Record where the reported text starts; the whole match may begin earlier (e.g. at a leading delimiter).
+ indices.add(capture > 0 ? m.start(capture) : m.start());
}
}
- return result;
+ return matches;
}
private static Pattern getPattern(String pattern) {
http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/204977ec/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java
new file mode 100644
index 0000000..d2912b7
--- /dev/null
+++ b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.streams.regex;
+
+
+import com.google.common.collect.Sets;
+import org.apache.streams.core.StreamsDatum;
+import org.apache.streams.pojo.json.Activity;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.Set;
+
+import static org.apache.streams.data.util.ActivityUtil.ensureExtensions;
+import static org.hamcrest.CoreMatchers.equalTo;
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.assertThat;
+
+/**
+ * Verifies that a {@link RegexHashtagExtractor} using its default pattern stores exactly the expected hashtags in
+ * the extensions of the processed {@link Activity}.
+ */
+@RunWith(Parameterized.class)
+public class RegexHashtagExtractorTest {
+
+    private Activity activity;
+    private Set<String> hashtags;
+
+    /**
+     * @param activityContent the content assigned to the activity under test
+     * @param hashtags        the hashtags (without the leading '#') expected to be extracted
+     */
+    public RegexHashtagExtractorTest(String activityContent, Set<String> hashtags) {
+        this.activity = new Activity();
+        this.activity.setContent(activityContent);
+        this.hashtags = hashtags;
+    }
+
+    @Parameterized.Parameters
+    public static Collection<Object[]> params() {
+        return Arrays.asList(new Object[][]{
+                {"This is the #content of a standard tweet", Sets.newHashSet("content")},
+                {"This is the content of a standard tweet", Sets.newHashSet()},
+                {"This is the #content of a standard #tweet", Sets.newHashSet("content", "tweet")},
+                {"This is the body of a #fbpost. It can have multiple lines of #content, as well as much more detailed and flowery #language.", Sets.newHashSet("content", "fbpost", "language")}
+        });
+    }
+
+    @Test
+    public void testExtraction() {
+        StreamsDatum datum = new StreamsDatum(activity, "Test");
+        List<StreamsDatum> result = new RegexHashtagExtractor().process(datum);
+        assertThat(result.size(), is(equalTo(1)));
+        Activity output = (Activity)result.get(0).getDocument();
+        @SuppressWarnings("unchecked") // the extractor stores the hashtags as a set of strings
+        Set<String> extracted = (Set<String>) ensureExtensions(output).get(RegexHashtagExtractor.EXTENSION_KEY);
+        // Require set equality: a one-directional difference would also pass when nothing was extracted at all.
+        assertThat(extracted, is(equalTo(hashtags)));
+    }
+}
http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/204977ec/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java
index eed1327..fc2b9f6 100644
--- a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java
+++ b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java
@@ -27,6 +27,7 @@ import org.junit.runners.Parameterized;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
+import java.util.Map;
import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.CoreMatchers.is;
@@ -56,10 +57,11 @@ public class RegexUtilsTest {
{"#\\w+", "This is #freakingcrazydude.", 1, 1},
{"#\\w+", "This is #freakingcrazydude!", 1, 1},
{"#\\w+", "This is #freakingcrazydude I went to the moon", 1, 1},
- {"#\\w+", "This is #freakingcrazydude I went to the #freakingcrazydude? party", 2, 2},
- {"#\\w+", "This is #freakingcrazydude I went to the #freakingcrazydude; party", 2, 2},
- {"#\\w+", "This is #freakingcrazydude I went to the #freakingcrazydude, party", 2, 2},
- {"#\\w+", "This is#freakingcrazydude I went to the #freakingcrazydude party", 2, 1},
+ {"#\\w+", "This is #freakingcrazydude I went to the #crazy? party", 2, 2},
+ {"#\\w+", "This is #freakingcrazydude I went to the #crazy; party", 2, 2},
+ {"#\\w+", "This is #freakingcrazydude I went to the #crazy, party", 2, 2},
+ {"#\\w+", "This is#freakingcrazydude I went to the #crazy party", 2, 1},
+ {"#\\w+", "This is #freakingcrazydude I went to the #freakingcrazydude party", 1, 1},
{"#\\w+", "#what does the fox say?", 1, 1},
{"#\\w+", "#what does the fox #say", 2, 2}
});
@@ -68,10 +70,10 @@ public class RegexUtilsTest {
@Test
public void testMatches_simple() {
- List<String> wordResults = RegexUtils.extractWordMatches(this.pattern, this.content);
+ Map<String, List<Integer>> wordResults = RegexUtils.extractWordMatches(this.pattern, this.content); // size() counts distinct matched strings; repeated matches share one key
assertThat(wordResults.size(), is(equalTo(wordMatchCount)));
- List<String> regularResults = RegexUtils.extractMatches(this.pattern, this.content);
+ Map<String, List<Integer>> regularResults = RegexUtils.extractMatches(this.pattern, this.content); // likewise keyed by matched text, so duplicates are counted once
assertThat(regularResults.size(), is(equalTo(regularMatchCount)));
}