You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@streams.apache.org by mf...@apache.org on 2014/05/14 17:31:23 UTC
[4/8] git commit: Added URL extractor
Added URL extractor
Project: http://git-wip-us.apache.org/repos/asf/incubator-streams/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-streams/commit/a1b02094
Tree: http://git-wip-us.apache.org/repos/asf/incubator-streams/tree/a1b02094
Diff: http://git-wip-us.apache.org/repos/asf/incubator-streams/diff/a1b02094
Branch: refs/heads/master
Commit: a1b02094ebd61233888635d6bcbe0ce383a6c009
Parents: 75578e9
Author: mfranklin <mf...@apache.org>
Authored: Wed May 14 11:18:06 2014 -0400
Committer: mfranklin <mf...@apache.org>
Committed: Wed May 14 11:31:10 2014 -0400
----------------------------------------------------------------------
.../regex/AbstractRegexExtensionExtractor.java | 5 +-
.../apache/streams/regex/RegexUrlExtractor.java | 68 +++++++++++++++++++
.../streams/regex/RegexUrlExtractorTest.java | 70 ++++++++++++++++++++
3 files changed, 141 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/a1b02094/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/AbstractRegexExtensionExtractor.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/AbstractRegexExtensionExtractor.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/AbstractRegexExtensionExtractor.java
index 6774962..23d1ad5 100644
--- a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/AbstractRegexExtensionExtractor.java
+++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/AbstractRegexExtensionExtractor.java
@@ -26,6 +26,7 @@ import org.apache.streams.core.StreamsDatum;
import org.apache.streams.core.StreamsProcessor;
import org.apache.streams.pojo.json.Activity;
+import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -63,7 +64,7 @@ public abstract class AbstractRegexExtensionExtractor<T> implements StreamsProce
}
Activity activity = (Activity)entry.getDocument();
Map<String, List<Integer>> matches = RegexUtils.extractMatches(pattern, activity.getContent());
- Set<T> entities = ensureMentionExtension(activity);
+ Collection<T> entities = ensureTargetObject(activity);
for(String key : matches.keySet()) {
entities.add(prepareObject(key));
}
@@ -96,7 +97,7 @@ public abstract class AbstractRegexExtensionExtractor<T> implements StreamsProce
protected abstract T prepareObject(String extracted);
@SuppressWarnings("unchecked")
- protected Set<T> ensureMentionExtension(Activity activity) {
+ protected Collection<T> ensureTargetObject(Activity activity) {
Map<String, Object> extensions = ensureExtensions(activity);
Set<T> hashtags;
if(extensions.containsKey(extensionKey)) {
http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/a1b02094/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUrlExtractor.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUrlExtractor.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUrlExtractor.java
new file mode 100644
index 0000000..5d37b3a
--- /dev/null
+++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUrlExtractor.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.streams.regex;
+
+import org.apache.streams.core.StreamsProcessor;
+import org.apache.streams.pojo.json.Activity;
+
+import java.util.Collection;
+
+/**
+ * Regex-driven processor that looks for URLs in the content of an
+ * {@link org.apache.streams.pojo.json.Activity} and adds every match to the collection returned by
+ * {@link org.apache.streams.pojo.json.Activity#getLinks()}.
+ */
+public class RegexUrlExtractor extends AbstractRegexExtensionExtractor<String> implements StreamsProcessor {
+
+    // Temporarily copied from streams-processor-urls so as not to force a dependency on that provider.
+    // This should be moved to a common utility package.
+
+    /** Mandatory scheme: "http", "https" or "ftp", followed by "://". */
+    private static final String SCHEME = "(?:(?:https?|ftp)://)";
+
+    /** Optional credentials prefix of the form "user@" or "user:password@". */
+    private static final String USER_INFO = "(?:\\S+(?::\\S*)?@)?";
+
+    /**
+     * Dotted-quad IPv4 host. The three leading negative look-aheads reject the 10.x.x.x, 127.x.x.x,
+     * 169.254.x.x, 192.168.x.x and 172.16-31.x.x ranges before the octets themselves are matched.
+     */
+    private static final String IPV4_HOST =
+        "(?!(?:10|127)(?:\\.\\d{1,3}){3})" +
+        "(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})" +
+        "(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})" +
+        // first octet: 1-223
+        "(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])" +
+        // second and third octets: 0-255
+        "(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}" +
+        // last octet: 1-254
+        "(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))";
+
+    /**
+     * Named host: one or more dot-separated labels followed by a top-level label of at least two
+     * non-digit characters. The character classes only list lower-case ASCII letters; whether
+     * upper-case hosts match depends on the flags used when the pattern is compiled, which is not
+     * decided in this class.
+     */
+    private static final String NAMED_HOST =
+        "(?:(?:[a-z\\u00a1-\\uffff0-9]+-?)*[a-z\\u00a1-\\uffff0-9]+)" +
+        "(?:\\.(?:[a-z\\u00a1-\\uffff0-9]+-?)*[a-z\\u00a1-\\uffff0-9]+)*" +
+        "(?:\\.(?:[a-z\\u00a1-\\uffff]{2,}))";
+
+    /** Optional port: a colon followed by two to five digits. */
+    private static final String PORT = "(?::\\d{2,5})?";
+
+    /** Optional path/query/fragment: a slash followed by any run of non-whitespace characters. */
+    private static final String PATH = "(?:/[^\\s]*)?";
+
+    /** Pattern handed to the superclass as the default URL expression. */
+    public static final String DEFAULT_PATTERN =
+        SCHEME + USER_INFO + "(?:" + IPV4_HOST + "|" + NAMED_HOST + ")" + PORT + PATH;
+
+    /** Key passed to the superclass as the first constructor argument. */
+    public static final String PATTERN_CONFIG_KEY = "URLPattern";
+
+    /**
+     * Creates an extractor using {@link #PATTERN_CONFIG_KEY} and {@link #DEFAULT_PATTERN}.
+     * The second superclass argument is {@code null}; presumably no extension key is needed because
+     * {@link #ensureTargetObject(Activity)} is overridden - confirm against the superclass.
+     */
+    public RegexUrlExtractor() {
+        super(PATTERN_CONFIG_KEY, null, DEFAULT_PATTERN);
+    }
+
+    /**
+     * Returns the matched text unchanged; a URL is stored as the plain string that was found.
+     *
+     * @param extracted the text matched by the pattern
+     * @return {@code extracted} itself
+     */
+    @Override
+    protected String prepareObject(String extracted) {
+        return extracted;
+    }
+
+    /**
+     * Selects the collection that extracted URLs are added to.
+     *
+     * @param activity the activity being processed
+     * @return the value of {@code activity.getLinks()}, returned as-is
+     */
+    @Override
+    protected Collection<String> ensureTargetObject(Activity activity) {
+        return activity.getLinks();
+    }
+}
http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/a1b02094/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUrlExtractorTest.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUrlExtractorTest.java b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUrlExtractorTest.java
new file mode 100644
index 0000000..38b8dab
--- /dev/null
+++ b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUrlExtractorTest.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.streams.regex;
+
+
+import com.google.common.collect.Sets;
+import org.apache.streams.core.StreamsDatum;
+import org.apache.streams.pojo.json.Activity;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import java.util.*;
+
+import static org.hamcrest.CoreMatchers.equalTo;
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.assertThat;
+
+/**
+ * Parameterized test for {@link RegexUrlExtractor}: each case supplies the content of an
+ * {@link Activity} and the exact set of links expected on the activity after processing.
+ */
+@RunWith(Parameterized.class)
+public class RegexUrlExtractorTest {
+
+    private final Activity activity;
+    private final Set<String> links;
+
+    /**
+     * @param activityContent text placed in the activity's content field
+     * @param links           URLs expected to be extracted from {@code activityContent}
+     */
+    public RegexUrlExtractorTest(String activityContent, Set<String> links) {
+        this.activity = new Activity();
+        this.activity.setContent(activityContent);
+        this.links = links;
+    }
+
+    /**
+     * Supplies the test cases as pairs of {content, expected links}.
+     */
+    @Parameterized.Parameters
+    public static Collection<Object[]> params() {
+        return Arrays.asList(new Object[][]{
+                {"This is the http://t.co/foo of a standard tweet", Sets.newHashSet("http://t.co/foo")},
+                {"This is the https://t.co/foo of a standard tweet", Sets.newHashSet("https://t.co/foo")},
+                {"This is the http://amd.com/test of a standard tweet", Sets.newHashSet("http://amd.com/test")},
+                // no URL present: nothing may be extracted
+                {"This is the content of a standard tweet", Sets.newHashSet()},
+                // query string containing '&' must be captured as part of the URL; the trailing @mention must not
+                {"This is the http://www.google.com/articles/awesome?with=query&params=true of a standard @tweet", Sets.newHashSet("http://www.google.com/articles/awesome?with=query&params=true")}
+        });
+    }
+
+    /**
+     * Runs the extractor over the activity and checks that a single datum comes back whose links
+     * are exactly the expected set.
+     */
+    @Test
+    @SuppressWarnings("unchecked")
+    public void testExtraction() {
+        StreamsDatum datum = new StreamsDatum(activity, "Test");
+        List<StreamsDatum> result = new RegexUrlExtractor().process(datum);
+        assertThat(result.size(), is(equalTo(1)));
+        Activity output = (Activity)result.get(0).getDocument();
+        Set<String> extracted = Sets.newHashSet(output.getLinks());
+        // Compare for equality rather than a one-way difference: difference(expected, extracted)
+        // is also empty when the extractor returns additional, unexpected links.
+        assertThat(extracted, is(equalTo(links)));
+    }
+}