You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@streams.apache.org by mf...@apache.org on 2014/05/14 17:31:23 UTC

[4/8] git commit: Added URL extractor

Added URL extractor


Project: http://git-wip-us.apache.org/repos/asf/incubator-streams/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-streams/commit/a1b02094
Tree: http://git-wip-us.apache.org/repos/asf/incubator-streams/tree/a1b02094
Diff: http://git-wip-us.apache.org/repos/asf/incubator-streams/diff/a1b02094

Branch: refs/heads/master
Commit: a1b02094ebd61233888635d6bcbe0ce383a6c009
Parents: 75578e9
Author: mfranklin <mf...@apache.org>
Authored: Wed May 14 11:18:06 2014 -0400
Committer: mfranklin <mf...@apache.org>
Committed: Wed May 14 11:31:10 2014 -0400

----------------------------------------------------------------------
 .../regex/AbstractRegexExtensionExtractor.java  |  5 +-
 .../apache/streams/regex/RegexUrlExtractor.java | 68 +++++++++++++++++++
 .../streams/regex/RegexUrlExtractorTest.java    | 70 ++++++++++++++++++++
 3 files changed, 141 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/a1b02094/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/AbstractRegexExtensionExtractor.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/AbstractRegexExtensionExtractor.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/AbstractRegexExtensionExtractor.java
index 6774962..23d1ad5 100644
--- a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/AbstractRegexExtensionExtractor.java
+++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/AbstractRegexExtensionExtractor.java
@@ -26,6 +26,7 @@ import org.apache.streams.core.StreamsDatum;
 import org.apache.streams.core.StreamsProcessor;
 import org.apache.streams.pojo.json.Activity;
 
+import java.util.Collection;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -63,7 +64,7 @@ public abstract class AbstractRegexExtensionExtractor<T> implements StreamsProce
         }
         Activity activity = (Activity)entry.getDocument();
         Map<String, List<Integer>> matches = RegexUtils.extractMatches(pattern, activity.getContent());
-        Set<T> entities = ensureMentionExtension(activity);
+        Collection<T> entities = ensureTargetObject(activity);
         for(String key : matches.keySet()) {
             entities.add(prepareObject(key));
         }
@@ -96,7 +97,7 @@ public abstract class AbstractRegexExtensionExtractor<T> implements StreamsProce
     protected abstract T prepareObject(String extracted);
 
     @SuppressWarnings("unchecked")
-    protected Set<T> ensureMentionExtension(Activity activity) {
+    protected Collection<T> ensureTargetObject(Activity activity) {
         Map<String, Object> extensions = ensureExtensions(activity);
         Set<T> hashtags;
         if(extensions.containsKey(extensionKey)) {

http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/a1b02094/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUrlExtractor.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUrlExtractor.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUrlExtractor.java
new file mode 100644
index 0000000..5d37b3a
--- /dev/null
+++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUrlExtractor.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.streams.regex;
+
+import org.apache.streams.core.StreamsProcessor;
+import org.apache.streams.pojo.json.Activity;
+
+import java.util.ArrayList;
+import java.util.Collection;
+
+/**
+ * Processes the content of an {@link org.apache.streams.pojo.json.Activity} object to extract the URLs it
+ * contains and adds them to the activity's links collection.
+ *
+ * <p>Unlike the default behavior of {@link AbstractRegexExtensionExtractor}, matches are not stored under an
+ * extension key: {@link #ensureTargetObject(Activity)} is overridden so that they land in
+ * {@link Activity#getLinks()} instead.</p>
+ */
+public class RegexUrlExtractor extends AbstractRegexExtensionExtractor<String> implements StreamsProcessor {
+
+    /**
+     * Default URL-matching expression: an http, https or ftp scheme, optional user-info, either a dotted-quad
+     * address (with several private/reserved ranges excluded by negative lookahead) or a dotted host name,
+     * an optional port and an optional path that runs up to the next whitespace character.
+     *
+     * <p>Temporarily copied from streams-processor-urls so as not to force a dependency on that provider.
+     * This should be moved to a common utility package.</p>
+     */
+    public static final String DEFAULT_PATTERN =
+            "(?:(?:https?|ftp)://)" +
+                    "(?:\\S+(?::\\S*)?@)?" +
+                    "(?:" +
+                    "(?!(?:10|127)(?:\\.\\d{1,3}){3})" +
+                    "(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})" +
+                    "(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})" +
+                    "(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])" +
+                    "(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}" +
+                    "(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))" +
+                    "|" +
+                    "(?:(?:[a-z\\u00a1-\\uffff0-9]+-?)*[a-z\\u00a1-\\uffff0-9]+)" +
+                    "(?:\\.(?:[a-z\\u00a1-\\uffff0-9]+-?)*[a-z\\u00a1-\\uffff0-9]+)*" +
+                    "(?:\\.(?:[a-z\\u00a1-\\uffff]{2,}))" +
+                    ")" +
+                    "(?::\\d{2,5})?" +
+                    "(?:/[^\\s]*)?";
+
+    /** Key passed to the superclass as the first constructor argument. */
+    public static final String PATTERN_CONFIG_KEY = "URLPattern";
+
+    /**
+     * Creates an extractor that falls back to {@link #DEFAULT_PATTERN}.
+     *
+     * <p>NOTE(review): the second superclass argument is {@code null}; presumably it is the extension key,
+     * which is unused here because {@link #ensureTargetObject(Activity)} is overridden -- confirm against
+     * {@link AbstractRegexExtensionExtractor}.</p>
+     */
+    public RegexUrlExtractor() {
+        super(PATTERN_CONFIG_KEY, null, DEFAULT_PATTERN);
+    }
+
+    /**
+     * Returns the matched text unchanged; a URL is stored as a plain string.
+     *
+     * @param extracted the text matched by the pattern
+     * @return {@code extracted} itself
+     */
+    @Override
+    protected String prepareObject(String extracted) {
+        return extracted;
+    }
+
+    /**
+     * Supplies the collection that extracted URLs are added to, creating it when the activity has none.
+     *
+     * @param activity the activity being processed
+     * @return the activity's links collection, never {@code null}
+     */
+    @Override
+    protected Collection<String> ensureTargetObject(Activity activity) {
+        // The base class calls add() on the returned collection without a null check, so guarantee one exists.
+        if (activity.getLinks() == null) {
+            activity.setLinks(new ArrayList<String>());
+        }
+        return activity.getLinks();
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/a1b02094/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUrlExtractorTest.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUrlExtractorTest.java b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUrlExtractorTest.java
new file mode 100644
index 0000000..38b8dab
--- /dev/null
+++ b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUrlExtractorTest.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.streams.regex;
+
+
+import com.google.common.collect.Sets;
+import org.apache.streams.core.StreamsDatum;
+import org.apache.streams.pojo.json.Activity;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import java.util.*;
+
+import static org.hamcrest.CoreMatchers.equalTo;
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.assertThat;
+
+/**
+ * Parameterized test for {@link RegexUrlExtractor}. Each case pairs an activity's content with the exact set
+ * of URLs expected in the activity's links after processing.
+ */
+@RunWith(Parameterized.class)
+public class RegexUrlExtractorTest {
+
+    private final Activity activity;
+    private final Set<String> links;
+
+    /**
+     * @param activityContent text placed in the activity's content field
+     * @param links           the URLs expected to be extracted from that content
+     */
+    public RegexUrlExtractorTest(String activityContent, Set<String> links) {
+        this.activity = new Activity();
+        this.activity.setContent(activityContent);
+        this.links = links;
+    }
+
+    /**
+     * @return the test cases, each as {content, expected links}
+     */
+    @Parameterized.Parameters
+    public static Collection<Object[]> params() {
+        return Arrays.asList(new Object[][]{
+                {"This is the http://t.co/foo of a standard tweet", Sets.newHashSet("http://t.co/foo")},
+                {"This is the https://t.co/foo of a standard tweet", Sets.newHashSet("https://t.co/foo")},
+                {"This is the http://amd.com/test of a standard tweet", Sets.newHashSet("http://amd.com/test")},
+                {"This is the content of a standard tweet", Sets.newHashSet()},
+                {"This is the http://www.google.com/articles/awesome?with=query&params=true of a standard @tweet",  Sets.newHashSet("http://www.google.com/articles/awesome?with=query&params=true")}
+        });
+    }
+
+    /**
+     * Runs the extractor over the activity and checks that exactly the expected URLs were collected.
+     */
+    @Test
+    @SuppressWarnings("unchecked")
+    public void testExtraction() {
+        StreamsDatum datum = new StreamsDatum(activity, "Test");
+        List<StreamsDatum> result = new RegexUrlExtractor().process(datum);
+        assertThat(result.size(), is(equalTo(1)));
+        Activity output = (Activity)result.get(0).getDocument();
+        Set<String> extracted = Sets.newHashSet(output.getLinks());
+        // Compare whole sets: a one-way difference would let unexpected extra matches (and the
+        // "no URL" case) pass unnoticed.
+        assertThat(extracted, is(equalTo(links)));
+    }
+}