You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2012/02/23 13:32:50 UTC

svn commit: r1292764 - in /nutch/trunk: ./ conf/ src/plugin/ src/plugin/urlfilter-domainblacklist/ src/plugin/urlfilter-domainblacklist/data/ src/plugin/urlfilter-domainblacklist/src/ src/plugin/urlfilter-domainblacklist/src/java/ src/plugin/urlfilter-...

Author: markus
Date: Thu Feb 23 12:32:49 2012
New Revision: 1292764

URL: http://svn.apache.org/viewvc?rev=1292764&view=rev
Log:
NUTCH-1210 Domain Blacklist Filter

Added:
    nutch/trunk/conf/domainblacklist-urlfilter.txt
    nutch/trunk/src/plugin/urlfilter-domainblacklist/
    nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml
    nutch/trunk/src/plugin/urlfilter-domainblacklist/data/
    nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt
    nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml
    nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml
    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/
    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/
    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/
    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/
    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/
    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/
    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/
    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/
    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/
    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/
    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/
    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/
    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/
    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/build.xml

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1292764&r1=1292763&r2=1292764&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Feb 23 12:32:49 2012
@@ -1,5 +1,7 @@
 Nutch Change Log
 
+* NUTCH-1210 DomainBlacklistFilter (markus)
+
 * NUTCH-965 Skip parsing for truncated documents (alexis, lewismc, ferdy)
 
 * NUTCH-1193 Incorrect url transform to lowercase: parameter solr (Eduardo dos Santos Leggiero via lewismc)

Added: nutch/trunk/conf/domainblacklist-urlfilter.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/domainblacklist-urlfilter.txt?rev=1292764&view=auto
==============================================================================
--- nutch/trunk/conf/domainblacklist-urlfilter.txt (added)
+++ nutch/trunk/conf/domainblacklist-urlfilter.txt Thu Feb 23 12:32:49 2012
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# config file for urlfilter-domainblacklist plugin: list one domain suffix, domain name or hostname to block per line

Modified: nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1292764&r1=1292763&r2=1292764&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Thu Feb 23 12:32:49 2012
@@ -57,6 +57,7 @@
      <ant dir="tld" target="deploy"/>
      <ant dir="urlfilter-automaton" target="deploy"/>
      <ant dir="urlfilter-domain" target="deploy" />
+     <ant dir="urlfilter-domainblacklist" target="deploy" />
      <ant dir="urlfilter-prefix" target="deploy"/>
      <ant dir="urlfilter-regex" target="deploy"/>
      <ant dir="urlfilter-suffix" target="deploy"/>
@@ -132,6 +133,7 @@
     <ant dir="tld" target="clean"/>
     <ant dir="urlfilter-automaton" target="clean"/>
     <ant dir="urlfilter-domain" target="clean" />
+    <ant dir="urlfilter-domainblacklist" target="clean" />
     <ant dir="urlfilter-prefix" target="clean"/>
     <ant dir="urlfilter-regex" target="clean"/>
     <ant dir="urlfilter-suffix" target="clean"/>

Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml?rev=1292764&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml (added)
+++ nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml Thu Feb 23 12:32:49 2012
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-domainblacklist" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="data" />
+  </copy>
+
+</project>

Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt?rev=1292764&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt (added)
+++ nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt Thu Feb 23 12:32:49 2012
@@ -0,0 +1,5 @@
+# comments start with the pound sign
+net
+apache.org
+be
+www.yahoo.com
\ No newline at end of file

Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml?rev=1292764&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml (added)
+++ nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml Thu Feb 23 12:32:49 2012
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml?rev=1292764&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml (added)
+++ nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml Thu Feb 23 12:32:49 2012
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlfilter-domainblacklist"
+   name="Domain Blacklist URL Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlfilter-domainblacklist.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlfilter.domainblacklist"
+              name="Nutch Domain Blacklist URL Filter"
+              point="org.apache.nutch.net.URLFilter">
+      <implementation id="DomainBlacklistURLFilter"
+        class="org.apache.nutch.urlfilter.domainblacklist.DomainBlacklistURLFilter">
+        <parameter name="file" value="domainblacklist-urlfilter.txt"/>
+      </implementation>
+   </extension>
+
+</plugin>

Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java?rev=1292764&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java (added)
+++ nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java Thu Feb 23 12:32:49 2012
@@ -0,0 +1,251 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.domainblacklist;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+import org.apache.commons.lang.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLFilter;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.util.URLUtil;
+import org.apache.nutch.util.domain.DomainSuffix;
+
+/**
+ * <p>Filters URLs based on a file containing domain suffixes, domain names, and
+ * hostnames. A url that matches one of the suffixes, domains, or hosts
+ * present in the file is filtered out.</p>
+ *
+ * <p>Urls are checked in order of domain suffix, domain name, and hostname
+ * against entries in the domain file. The domain file would be setup as follows
+ * with one entry per line:</p>
+ *
+ * <pre>
+ * com
+ * apache.org
+ * www.apache.org
+ * </pre>
+ *
+ * <p>The first line is an example of a filter that would block all .com
+ * domains. The second line blocks all urls from apache.org and all of its
+ * subdomains such as lucene.apache.org and hadoop.apache.org. The third line
+ * would block only urls from www.apache.org. There is no specific ordering to
+ * entries. The entries are from more general to more specific with the more
+ * general overriding the more specific. Blank lines and lines starting with
+ * '#' are ignored, and entries are lower-cased when they are loaded.</p>
+ *
+ * <p>The domain file defaults to domainblacklist-urlfilter.txt in the classpath
+ * but can be overridden using the:</p>
+ *
+ * <ul>
+ * <li>property "urlfilter.domainblacklist.file" in ./conf/nutch-*.xml, and</li>
+ * <li>attribute "file" in plugin.xml of this plugin</li>
+ * </ul>
+ *
+ * <p>The attribute "file" has higher precedence if defined. Rules may also be
+ * given inline through the property "urlfilter.domainblacklist.rules", which
+ * takes precedence over any file.</p>
+ */
+public class DomainBlacklistURLFilter
+  implements URLFilter {
+
+  private static final Logger LOG = LoggerFactory.getLogger(DomainBlacklistURLFilter.class);
+
+  // id of this plugin, used to look up its extension in the plugin repository
+  private static final String PLUGIN_NAME = "urlfilter-domainblacklist";
+
+  private Configuration conf;
+  // file handed to the constructor; wins over every other file setting
+  private String domainFile = null;
+  // blacklist entries (suffixes, domains and hosts), stored lower-cased
+  private Set<String> domainSet = new LinkedHashSet<String>();
+
+  /**
+   * Reads blacklist entries from the given reader into the domain set. Each
+   * line is trimmed; blank lines and lines starting with '#' are skipped and
+   * everything else is stored lower-cased. The reader is closed afterwards.
+   *
+   * @param configReader The source of the entries, one per line.
+   *
+   * @throws IOException If reading from or closing the reader fails.
+   */
+  private void readConfiguration(Reader configReader)
+    throws IOException {
+
+    BufferedReader reader = new BufferedReader(configReader);
+    try {
+      String line = null;
+      while ((line = reader.readLine()) != null) {
+        // trim first so stray whitespace cannot hide a rule or a comment
+        String entry = line.trim();
+        if (entry.length() > 0 && !entry.startsWith("#")) {
+          domainSet.add(StringUtils.lowerCase(entry));
+        }
+      }
+    }
+    finally {
+      // closing the wrapper also closes the underlying reader
+      reader.close();
+    }
+  }
+
+  /**
+   * Default constructor.
+   */
+  public DomainBlacklistURLFilter() {
+
+  }
+
+  /**
+   * Constructor that specifies the domain file to use.
+   *
+   * @param domainFile The domain file, overrides domainblacklist-urlfilter.txt default.
+   */
+  public DomainBlacklistURLFilter(String domainFile) {
+    this.domainFile = domainFile;
+  }
+
+  /**
+   * Sets the configuration and (re)loads the blacklist. Rules come from the
+   * property "urlfilter.domainblacklist.rules" if it is set, otherwise from
+   * the first defined of: the constructor's domain file, the plugin attribute
+   * "file", the property "urlfilter.domainblacklist.file". Failures to load
+   * are logged and leave the filter with whatever entries were read so far.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // read in attribute "file" of this plugin
+    String attributeFile = null;
+    Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
+      URLFilter.class.getName()).getExtensions();
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+      if (extension.getDescriptor().getPluginId().equals(PLUGIN_NAME)) {
+        attributeFile = extension.getAttribute("file");
+        break;
+      }
+    }
+
+    // handle blank non empty input
+    if (StringUtils.isBlank(attributeFile)) {
+      attributeFile = null;
+    }
+
+    if (attributeFile != null) {
+      LOG.info("Attribute \"file\" is defined for plugin " + PLUGIN_NAME
+        + " as " + attributeFile);
+    }
+    else {
+      LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+        + PLUGIN_NAME);
+    }
+
+    // domain file and attribute "file" take precedence if defined
+    String file = conf.get("urlfilter.domainblacklist.file");
+    String stringRules = conf.get("urlfilter.domainblacklist.rules");
+    if (domainFile != null) {
+      file = domainFile;
+    }
+    else if (attributeFile != null) {
+      file = attributeFile;
+    }
+
+    // start from scratch so that reconfiguring never keeps stale entries
+    domainSet.clear();
+    try {
+      Reader reader = null;
+      if (stringRules != null) { // takes precedence over files
+        reader = new StringReader(stringRules);
+      }
+      else if (file != null) {
+        // try it as a configuration resource first, then as a plain file
+        reader = conf.getConfResourceAsReader(file);
+        if (reader == null) {
+          reader = new FileReader(file);
+        }
+      }
+
+      if (reader == null) {
+        LOG.error("No rules or file configured for plugin " + PLUGIN_NAME
+          + ", no url will be blacklisted");
+      }
+      else {
+        readConfiguration(reader);
+      }
+    }
+    catch (IOException e) {
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+    }
+  }
+
+  /**
+   * Returns the configuration set through {@link #setConf(Configuration)}.
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Checks the url against the blacklist.
+   *
+   * @param url The url to check.
+   *
+   * @return The url if it may pass, or null if its domain suffix, domain name
+   * or host is blacklisted or if the url could not be processed.
+   */
+  public String filter(String url) {
+
+    try {
+
+      // match for suffix, domain, and host in that order.  more general will
+      // override more specific
+      String domain = URLUtil.getDomainName(url).toLowerCase().trim();
+      String host = URLUtil.getHost(url);
+      String suffix = null;
+      DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
+      if (domainSuffix != null) {
+        suffix = domainSuffix.getDomain();
+      }
+
+      if (domainSet.contains(suffix) || domainSet.contains(domain)
+        || domainSet.contains(host)) {
+        // Matches, filter!
+        return null;
+      }
+
+      // doesn't match, allow
+      return url;
+    }
+    catch (Exception e) {
+
+      // if an error happens the url is rejected, it cannot be checked
+      LOG.error("Could not apply filter on url: " + url + "\n"
+        + org.apache.hadoop.util.StringUtils.stringifyException(e));
+      return null;
+    }
+  }
+}

Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java?rev=1292764&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java (added)
+++ nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java Thu Feb 23 12:32:49 2012
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.domainblacklist;
+
+import junit.framework.TestCase;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * Checks {@link DomainBlacklistURLFilter} against the entries of the sample
+ * hosts.txt file found in the directory named by the "test.data" property.
+ */
+public class TestDomainBlacklistURLFilter
+  extends TestCase {
+
+  protected static final Logger LOG = LoggerFactory.getLogger(TestDomainBlacklistURLFilter.class);
+
+  private final static String SEPARATOR = System.getProperty("file.separator");
+  private final static String SAMPLES = System.getProperty("test.data", ".");
+
+  public TestDomainBlacklistURLFilter(String testName) {
+    super(testName);
+  }
+
+  public void testFilter()
+    throws Exception {
+
+    Configuration conf = NutchConfiguration.create();
+    DomainBlacklistURLFilter filter =
+      new DomainBlacklistURLFilter(SAMPLES + SEPARATOR + "hosts.txt");
+    filter.setConf(conf);
+
+    // urls whose suffix, domain or host is listed in hosts.txt
+    String[] blocked = {
+      "http://lucene.apache.org",
+      "http://hadoop.apache.org",
+      "http://www.apache.org",
+      "http://www.foobar.net",
+      "http://www.foobas.net",
+      "http://www.yahoo.com",
+      "http://www.foobar.be"
+    };
+    for (int i = 0; i < blocked.length; i++) {
+      assertNull(blocked[i], filter.filter(blocked[i]));
+    }
+
+    // urls that match no entry of hosts.txt
+    String[] allowed = {
+      "http://www.google.com",
+      "http://mail.yahoo.com",
+      "http://www.adobe.com"
+    };
+    for (int i = 0; i < allowed.length; i++) {
+      assertNotNull(allowed[i], filter.filter(allowed[i]));
+    }
+  }
+
+}