You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@nutch.apache.org by Lewis John Mcgibbney <le...@gmail.com> on 2012/02/23 13:36:35 UTC

Re: svn commit: r1292764 - in /nutch/trunk: ./ conf/ src/plugin/ src/plugin/urlfilter-domainblacklist/ src/plugin/urlfilter-domainblacklist/data/ src/plugin/urlfilter-domainblacklist/src/ src/plugin/urlfilter-domainblacklist/src/java/ src/plugin/urlf

Hey Markus,

Great work with this one.

I notice that you did not add

<ant dir="urlfilter-domainblacklist" target="test" />

to nutch/trunk/src/plugin/build.xml

Lewis

On Thu, Feb 23, 2012 at 12:32 PM, <ma...@apache.org> wrote:

> Author: markus
> Date: Thu Feb 23 12:32:49 2012
> New Revision: 1292764
>
> URL: http://svn.apache.org/viewvc?rev=1292764&view=rev
> Log:
> NUTCH-1210 Domain Blacklist Filter
>
> Added:
>    nutch/trunk/conf/domainblacklist-urlfilter.txt
>    nutch/trunk/src/plugin/urlfilter-domainblacklist/
>    nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml
>    nutch/trunk/src/plugin/urlfilter-domainblacklist/data/
>    nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt
>    nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml
>    nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml
>    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/
>    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/
>    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/
>    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/
>
>  nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/
>
>  nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/
>
>  nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/
>
>  nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
>    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/
>    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/
>    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/
>
>  nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/
>
>  nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/
>
>  nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/
>
>  nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
> Modified:
>    nutch/trunk/CHANGES.txt
>    nutch/trunk/src/plugin/build.xml
>
> Modified: nutch/trunk/CHANGES.txt
> URL:
> http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1292764&r1=1292763&r2=1292764&view=diff
>
> ==============================================================================
> --- nutch/trunk/CHANGES.txt (original)
> +++ nutch/trunk/CHANGES.txt Thu Feb 23 12:32:49 2012
> @@ -1,5 +1,7 @@
>  Nutch Change Log
>
> +* NUTCH-1210 DomainBlacklistFilter (markus)
> +
>  * NUTCH-965 Skip parsing for truncated documents (alexis, lewismc, ferdy)
>
>  * NUTCH-1193 Incorrect url transform to lowercase: parameter solr
> (Eduardo dos Santos Leggiero via lewismc)
>
> Added: nutch/trunk/conf/domainblacklist-urlfilter.txt
> URL:
> http://svn.apache.org/viewvc/nutch/trunk/conf/domainblacklist-urlfilter.txt?rev=1292764&view=auto
>
> ==============================================================================
> --- nutch/trunk/conf/domainblacklist-urlfilter.txt (added)
> +++ nutch/trunk/conf/domainblacklist-urlfilter.txt Thu Feb 23 12:32:49 2012
> @@ -0,0 +1,16 @@
> +# Licensed to the Apache Software Foundation (ASF) under one or more
> +# contributor license agreements.  See the NOTICE file distributed with
> +# this work for additional information regarding copyright ownership.
> +# The ASF licenses this file to You under the Apache License, Version 2.0
> +# (the "License"); you may not use this file except in compliance with
> +# the License.  You may obtain a copy of the License at
> +#
> +#     http://www.apache.org/licenses/LICENSE-2.0
> +#
> +# Unless required by applicable law or agreed to in writing, software
> +# distributed under the License is distributed on an "AS IS" BASIS,
> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> +# See the License for the specific language governing permissions and
> +# limitations under the License.
> +
> +# config file for urlfilter-domainblacklist plugin
>
> Modified: nutch/trunk/src/plugin/build.xml
> URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1292764&r1=1292763&r2=1292764&view=diff
>
> ==============================================================================
> --- nutch/trunk/src/plugin/build.xml (original)
> +++ nutch/trunk/src/plugin/build.xml Thu Feb 23 12:32:49 2012
> @@ -57,6 +57,7 @@
>      <ant dir="tld" target="deploy"/>
>      <ant dir="urlfilter-automaton" target="deploy"/>
>      <ant dir="urlfilter-domain" target="deploy" />
> +     <ant dir="urlfilter-domainblacklist" target="deploy" />
>      <ant dir="urlfilter-prefix" target="deploy"/>
>      <ant dir="urlfilter-regex" target="deploy"/>
>      <ant dir="urlfilter-suffix" target="deploy"/>
> @@ -132,6 +133,7 @@
>     <ant dir="tld" target="clean"/>
>     <ant dir="urlfilter-automaton" target="clean"/>
>     <ant dir="urlfilter-domain" target="clean" />
> +    <ant dir="urlfilter-domainblacklist" target="clean" />
>     <ant dir="urlfilter-prefix" target="clean"/>
>     <ant dir="urlfilter-regex" target="clean"/>
>     <ant dir="urlfilter-suffix" target="clean"/>
>
> Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml
> URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml?rev=1292764&view=auto
>
> ==============================================================================
> --- nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml (added)
> +++ nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml Thu Feb 23
> 12:32:49 2012
> @@ -0,0 +1,28 @@
> +<?xml version="1.0"?>
> +<!--
> + Licensed to the Apache Software Foundation (ASF) under one or more
> + contributor license agreements.  See the NOTICE file distributed with
> + this work for additional information regarding copyright ownership.
> + The ASF licenses this file to You under the Apache License, Version 2.0
> + (the "License"); you may not use this file except in compliance with
> + the License.  You may obtain a copy of the License at
> +
> +     http://www.apache.org/licenses/LICENSE-2.0
> +
> + Unless required by applicable law or agreed to in writing, software
> + distributed under the License is distributed on an "AS IS" BASIS,
> + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + See the License for the specific language governing permissions and
> + limitations under the License.
> +-->
> +<project name="urlfilter-domainblacklist" default="jar-core">
> +
> +  <import file="../build-plugin.xml"/>
> +
> +  <!-- for junit test -->
> +  <mkdir dir="${build.test}/data"/>
> +  <copy todir="${build.test}/data">
> +    <fileset dir="data" />
> +  </copy>
> +
> +</project>
>
> Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt
> URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt?rev=1292764&view=auto
>
> ==============================================================================
> --- nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt (added)
> +++ nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt Thu
> Feb 23 12:32:49 2012
> @@ -0,0 +1,5 @@
> +# comments start with the pound sign
> +net
> +apache.org
> +be
> +www.yahoo.com
> \ No newline at end of file
>
> Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml
> URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml?rev=1292764&view=auto
>
> ==============================================================================
> --- nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml (added)
> +++ nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml Thu Feb 23
> 12:32:49 2012
> @@ -0,0 +1,41 @@
> +<?xml version="1.0" ?>
> +
> +<!--
> +   Licensed to the Apache Software Foundation (ASF) under one or more
> +   contributor license agreements.  See the NOTICE file distributed with
> +   this work for additional information regarding copyright ownership.
> +   The ASF licenses this file to You under the Apache License, Version 2.0
> +   (the "License"); you may not use this file except in compliance with
> +   the License.  You may obtain a copy of the License at
> +
> +       http://www.apache.org/licenses/LICENSE-2.0
> +
> +   Unless required by applicable law or agreed to in writing, software
> +   distributed under the License is distributed on an "AS IS" BASIS,
> +   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> +   See the License for the specific language governing permissions and
> +   limitations under the License.
> +-->
> +
> +<ivy-module version="1.0">
> +  <info organisation="org.apache.nutch" module="${ant.project.name}">
> +    <license name="Apache 2.0"/>
> +    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
> +    <description>
> +        Apache Nutch
> +    </description>
> +  </info>
> +
> +  <configurations>
> +    <include file="../../../ivy/ivy-configurations.xml"/>
> +  </configurations>
> +
> +  <publications>
> +    <!--get the artifact from our module name-->
> +    <artifact conf="master"/>
> +  </publications>
> +
> +  <dependencies>
> +  </dependencies>
> +
> +</ivy-module>
>
> Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml
> URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml?rev=1292764&view=auto
>
> ==============================================================================
> --- nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml (added)
> +++ nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml Thu Feb 23
> 12:32:49 2012
> @@ -0,0 +1,43 @@
> +<?xml version="1.0" encoding="UTF-8"?>
> +<!--
> + Licensed to the Apache Software Foundation (ASF) under one or more
> + contributor license agreements.  See the NOTICE file distributed with
> + this work for additional information regarding copyright ownership.
> + The ASF licenses this file to You under the Apache License, Version 2.0
> + (the "License"); you may not use this file except in compliance with
> + the License.  You may obtain a copy of the License at
> +
> +     http://www.apache.org/licenses/LICENSE-2.0
> +
> + Unless required by applicable law or agreed to in writing, software
> + distributed under the License is distributed on an "AS IS" BASIS,
> + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + See the License for the specific language governing permissions and
> + limitations under the License.
> +-->
> +<plugin
> +   id="urlfilter-domainblacklist"
> +   name="Domain Blacklist URL Filter"
> +   version="1.0.0"
> +   provider-name="nutch.org">
> +
> +   <runtime>
> +      <library name="urlfilter-domainblacklist.jar">
> +         <export name="*"/>
> +      </library>
> +   </runtime>
> +
> +   <requires>
> +      <import plugin="nutch-extensionpoints"/>
> +   </requires>
> +
> +   <extension id="org.apache.nutch.net.urlfilter.domainblacklist"
> +              name="Nutch Domain Blacklist URL Filter"
> +              point="org.apache.nutch.net.URLFilter">
> +      <implementation id="DomainBlacklistURLFilter"
> +
>  class="org.apache.nutch.urlfilter.domainblacklist.DomainBlacklistURLFilter">
> +        <parameter name="file" value="domainblacklist-urlfilter.txt"/>
> +      </implementation>
> +   </extension>
> +
> +</plugin>
>
> Added:
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
> URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java?rev=1292764&view=auto
>
> ==============================================================================
> ---
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
> (added)
> +++
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
> Thu Feb 23 12:32:49 2012
> @@ -0,0 +1,203 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements.  See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License.  You may obtain a copy of the License at
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.nutch.urlfilter.domainblacklist;
> +
> +import java.io.BufferedReader;
> +import java.io.FileReader;
> +import java.io.IOException;
> +import java.io.Reader;
> +import java.io.StringReader;
> +import java.util.LinkedHashSet;
> +import java.util.Set;
> +
> +import org.apache.commons.lang.StringUtils;
> +import org.slf4j.Logger;
> +import org.slf4j.LoggerFactory;
> +import org.apache.hadoop.conf.Configuration;
> +import org.apache.nutch.net.URLFilter;
> +import org.apache.nutch.plugin.Extension;
> +import org.apache.nutch.plugin.PluginRepository;
> +import org.apache.nutch.util.URLUtil;
> +import org.apache.nutch.util.domain.DomainSuffix;
> +
> +/**
> + * <p>Filters URLs based on a file containing domain suffixes, domain
> names, and
> + * hostnames. A url that matches one of the suffixes, domains, or hosts
> + * present in the file is filtered out.</p>
> + *
> + * <p>Urls are checked in order of domain suffix, domain name, and
> hostname
> + * against entries in the domain file. The domain file would be setup as
> follows
> + * with one entry per line:
> + *
> + * <pre> com apache.org www.apache.org </pre>
> + *
> + * <p>The first line is an example of a filter that would block all .com
> + * domains. The second line blocks all urls from apache.org and all of
> its
> + * subdomains such as lucene.apache.org and hadoop.apache.org. The third
> line
> + * would block only urls from www.apache.org. There is no specific
> ordering to
> + * entries. The entries are from more general to more specific with the
> more
> + * general overriding the more specific.</p>
> + *
> + * The domain file defaults to domainblacklist-urlfilter.txt in the
> classpath but can be
> + * overridden using the:
> + *
> + * <ul> <ol>property "urlfilter.domainblacklist.file" in
> ./conf/nutch-*.xml, and</ol>
> + * <ol>attribute "file" in plugin.xml of this plugin</ol> </ul>
> + *
> + * the attribute "file" has higher precedence if defined.
> + */
> +public class DomainBlacklistURLFilter
> +  implements URLFilter {
> +
> +  private static final Logger LOG =
> LoggerFactory.getLogger(DomainBlacklistURLFilter.class);
> +
> +  // read in attribute "file" of this plugin.
> +  private static String attributeFile = null;
> +  private Configuration conf;
> +  private String domainFile = null;
> +  private Set<String> domainSet = new LinkedHashSet<String>();
> +
> +  private void readConfiguration(Reader configReader)
> +    throws IOException {
> +
> +    // read the configuration file, line by line
> +    BufferedReader reader = new BufferedReader(configReader);
> +    String line = null;
> +    while ((line = reader.readLine()) != null) {
> +      if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
> +        // add non-blank lines and non-commented lines
> +        domainSet.add(StringUtils.lowerCase(line));
> +      }
> +    }
> +  }
> +
> +  /**
> +   * Default constructor.
> +   */
> +  public DomainBlacklistURLFilter() {
> +
> +  }
> +
> +  /**
> +   * Constructor that specifies the domain file to use.
> +   *
> +   * @param domainFile The domain file, overrides
> domainblacklist-urlfilter.txt default.
> +   *
> +   * @throws IOException
> +   */
> +  public DomainBlacklistURLFilter(String domainFile) {
> +    this.domainFile = domainFile;
> +  }
> +
> +  /**
> +   * Sets the configuration.
> +   */
> +  public void setConf(Configuration conf) {
> +    this.conf = conf;
> +
> +    // get the extensions for domain urlfilter
> +    String pluginName = "urlfilter-domainblacklist";
> +    Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
> +      URLFilter.class.getName()).getExtensions();
> +    for (int i = 0; i < extensions.length; i++) {
> +      Extension extension = extensions[i];
> +      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
> +        attributeFile = extension.getAttribute("file");
> +        break;
> +      }
> +    }
> +
> +    // handle blank non empty input
> +    if (attributeFile != null && attributeFile.trim().equals("")) {
> +      attributeFile = null;
> +    }
> +
> +    if (attributeFile != null) {
> +      if (LOG.isInfoEnabled()) {
> +        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
> +          + " as " + attributeFile);
> +      }
> +    }
> +    else {
> +      if (LOG.isWarnEnabled()) {
> +        LOG.warn("Attribute \"file\" is not defined in plugin.xml for
> plugin "
> +          + pluginName);
> +      }
> +    }
> +
> +    // domain file and attribute "file" take precedence if defined
> +    String file = conf.get("urlfilter.domainblacklist.file");
> +    String stringRules = conf.get("urlfilter.domainblacklist.rules");
> +    if (domainFile != null) {
> +      file = domainFile;
> +    }
> +    else if (attributeFile != null) {
> +      file = attributeFile;
> +    }
> +    Reader reader = null;
> +    if (stringRules != null) { // takes precedence over files
> +      reader = new StringReader(stringRules);
> +    } else {
> +      reader = conf.getConfResourceAsReader(file);
> +    }
> +    try {
> +      if (reader == null) {
> +        reader = new FileReader(file);
> +      }
> +      readConfiguration(reader);
> +    }
> +    catch (IOException e) {
> +      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
> +    }
> +  }
> +
> +  public Configuration getConf() {
> +    return this.conf;
> +  }
> +
> +  public String filter(String url) {
> +
> +    try {
> +
> +      // match for suffix, domain, and host in that order.  more general
> will
> +      // override more specific
> +      String domain = URLUtil.getDomainName(url).toLowerCase().trim();
> +      String host = URLUtil.getHost(url);
> +      String suffix = null;
> +      DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
> +      if (domainSuffix != null) {
> +        suffix = domainSuffix.getDomain();
> +      }
> +
> +      if (domainSet.contains(suffix) || domainSet.contains(domain)
> +        || domainSet.contains(host)) {
> +        // Matches, filter!
> +        return null;
> +      }
> +
> +      // doesn't match, allow
> +      return url;
> +    }
> +    catch (Exception e) {
> +
> +      // if an error happens, allow the url to pass
> +      LOG.error("Could not apply filter on url: " + url + "\n"
> +        + org.apache.hadoop.util.StringUtils.stringifyException(e));
> +      return null;
> +    }
> +  }
> +}
>
> Added:
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
> URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java?rev=1292764&view=auto
>
> ==============================================================================
> ---
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
> (added)
> +++
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
> Thu Feb 23 12:32:49 2012
> @@ -0,0 +1,57 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements.  See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License.  You may obtain a copy of the License at
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.nutch.urlfilter.domainblacklist;
> +
> +import junit.framework.TestCase;
> +
> +import org.slf4j.Logger;
> +import org.slf4j.LoggerFactory;
> +import org.apache.hadoop.conf.Configuration;
> +import org.apache.nutch.util.NutchConfiguration;
> +
> +public class TestDomainBlacklistURLFilter
> +  extends TestCase {
> +
> +  protected static final Logger LOG =
> LoggerFactory.getLogger(TestDomainBlacklistURLFilter.class);
> +
> +  private final static String SEPARATOR =
> System.getProperty("file.separator");
> +  private final static String SAMPLES = System.getProperty("test.data",
> ".");
> +
> +  public TestDomainBlacklistURLFilter(String testName) {
> +    super(testName);
> +  }
> +
> +  public void testFilter()
> +    throws Exception {
> +
> +    String domainBlacklistFile = SAMPLES + SEPARATOR + "hosts.txt";
> +    Configuration conf = NutchConfiguration.create();
> +    DomainBlacklistURLFilter domainBlacklistFilter = new
> DomainBlacklistURLFilter(domainBlacklistFile);
> +    domainBlacklistFilter.setConf(conf);
> +    assertNull(domainBlacklistFilter.filter("http://lucene.apache.org"));
> +    assertNull(domainBlacklistFilter.filter("http://hadoop.apache.org"));
> +    assertNull(domainBlacklistFilter.filter("http://www.apache.org"));
> +    assertNotNull(domainBlacklistFilter.filter("http://www.google.com"));
> +    assertNotNull(domainBlacklistFilter.filter("http://mail.yahoo.com"));
> +    assertNull(domainBlacklistFilter.filter("http://www.foobar.net"));
> +    assertNull(domainBlacklistFilter.filter("http://www.foobas.net"));
> +    assertNull(domainBlacklistFilter.filter("http://www.yahoo.com"));
> +    assertNull(domainBlacklistFilter.filter("http://www.foobar.be"));
> +    assertNotNull(domainBlacklistFilter.filter("http://www.adobe.com"));
> +  }
> +
> +}
>
>
>


-- 
*Lewis*

Re: svn commit: r1292764 - in /nutch/trunk: ./ conf/ src/plugin/ src/plugin/urlfilter-domainblacklist/ src/plugin/urlfilter-domainblacklist/data/ src/plugin/urlfilter-domainblacklist/src/ src/plugin/urlfilter-domainblacklist/src/java/ src/plugin/urlf

Posted by Markus Jelsma <ma...@openindex.io>.
I didn't? I explicitly added it. I'll check again and commit if I have to.
Thanks


On Thursday 23 February 2012 13:36:35 Lewis John Mcgibbney wrote:
> Hey Markus,
> 
> Great work with this one.
> 
> I notice that you did not add
> 
> <ant dir="urlfilter-domainblacklist" target="test" />
> 
> to nutch/trunk/src/plugin/build.xml
> 
> Lewis
> 
> On Thu, Feb 23, 2012 at 12:32 PM, <ma...@apache.org> wrote:
> > Author: markus
> > Date: Thu Feb 23 12:32:49 2012
> > New Revision: 1292764
> > 
> > URL: http://svn.apache.org/viewvc?rev=1292764&view=rev
> > Log:
> > NUTCH-1210 Domain Blacklist Filter
> > 
> > Added:
> >    nutch/trunk/conf/domainblacklist-urlfilter.txt
> >    nutch/trunk/src/plugin/urlfilter-domainblacklist/
> >    nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml
> >    nutch/trunk/src/plugin/urlfilter-domainblacklist/data/
> >    nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt
> >    nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml
> >    nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml
> >    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/
> >    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/
> >    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/
> >    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/
> >  
> >  nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nut
> >  ch/
> >  
> >  nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nut
> >  ch/urlfilter/
> >  
> >  nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nut
> >  ch/urlfilter/domainblacklist/
> >  
> >  nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nut
> >  ch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
> >  
> >    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/
> >    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/
> >    nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/
> >  
> >  nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nut
> >  ch/
> >  
> >  nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nut
> >  ch/urlfilter/
> >  
> >  nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nut
> >  ch/urlfilter/domainblacklist/
> >  
> >  nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nut
> >  ch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
> > 
> > Modified:
> >    nutch/trunk/CHANGES.txt
> >    nutch/trunk/src/plugin/build.xml
> > 
> > Modified: nutch/trunk/CHANGES.txt
> > URL:
> > http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1292764&r1=12927
> > 63&r2=1292764&view=diff
> > 
> > =========================================================================
> > ===== --- nutch/trunk/CHANGES.txt (original)
> > +++ nutch/trunk/CHANGES.txt Thu Feb 23 12:32:49 2012
> > @@ -1,5 +1,7 @@
> > 
> >  Nutch Change Log
> > 
> > +* NUTCH-1210 DomainBlacklistFilter (markus)
> > +
> > 
> >  * NUTCH-965 Skip parsing for truncated documents (alexis, lewismc,
> >  ferdy)
> >  
> >  * NUTCH-1193 Incorrect url transform to lowercase: parameter solr
> > 
> > (Eduardo dos Santos Leggiero via lewismc)
> > 
> > Added: nutch/trunk/conf/domainblacklist-urlfilter.txt
> > URL:
> > http://svn.apache.org/viewvc/nutch/trunk/conf/domainblacklist-urlfilter.t
> > xt?rev=1292764&view=auto
> > 
> > =========================================================================
> > ===== --- nutch/trunk/conf/domainblacklist-urlfilter.txt (added)
> > +++ nutch/trunk/conf/domainblacklist-urlfilter.txt Thu Feb 23 12:32:49
> > 2012 @@ -0,0 +1,16 @@
> > +# Licensed to the Apache Software Foundation (ASF) under one or more
> > +# contributor license agreements.  See the NOTICE file distributed with
> > +# this work for additional information regarding copyright ownership.
> > +# The ASF licenses this file to You under the Apache License, Version
> > 2.0 +# (the "License"); you may not use this file except in compliance
> > with +# the License.  You may obtain a copy of the License at
> > +#
> > +#     http://www.apache.org/licenses/LICENSE-2.0
> > +#
> > +# Unless required by applicable law or agreed to in writing, software
> > +# distributed under the License is distributed on an "AS IS" BASIS,
> > +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> > implied. +# See the License for the specific language governing
> > permissions and +# limitations under the License.
> > +
> > +# config file for urlfilter-domainblacklist plugin
> > 
> > Modified: nutch/trunk/src/plugin/build.xml
> > URL:
> > http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1292764
> > &r1=1292763&r2=1292764&view=diff
> > 
> > =========================================================================
> > ===== --- nutch/trunk/src/plugin/build.xml (original)
> > +++ nutch/trunk/src/plugin/build.xml Thu Feb 23 12:32:49 2012
> > @@ -57,6 +57,7 @@
> > 
> >      <ant dir="tld" target="deploy"/>
> >      <ant dir="urlfilter-automaton" target="deploy"/>
> >      <ant dir="urlfilter-domain" target="deploy" />
> > 
> > +     <ant dir="urlfilter-domainblacklist" target="deploy" />
> > 
> >      <ant dir="urlfilter-prefix" target="deploy"/>
> >      <ant dir="urlfilter-regex" target="deploy"/>
> >      <ant dir="urlfilter-suffix" target="deploy"/>
> > 
> > @@ -132,6 +133,7 @@
> > 
> >     <ant dir="tld" target="clean"/>
> >     <ant dir="urlfilter-automaton" target="clean"/>
> >     <ant dir="urlfilter-domain" target="clean" />
> > 
> > +    <ant dir="urlfilter-domainblacklist" target="clean" />
> > 
> >     <ant dir="urlfilter-prefix" target="clean"/>
> >     <ant dir="urlfilter-regex" target="clean"/>
> >     <ant dir="urlfilter-suffix" target="clean"/>
> > 
> > Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml
> > URL:
> > http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblack
> > list/build.xml?rev=1292764&view=auto
> > 
> > =========================================================================
> > ===== --- nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml
> > (added) +++ nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml
> > Thu Feb 23 12:32:49 2012
> > @@ -0,0 +1,28 @@
> > +<?xml version="1.0"?>
> > +<!--
> > + Licensed to the Apache Software Foundation (ASF) under one or more
> > + contributor license agreements.  See the NOTICE file distributed with
> > + this work for additional information regarding copyright ownership.
> > + The ASF licenses this file to You under the Apache License, Version 2.0
> > + (the "License"); you may not use this file except in compliance with
> > + the License.  You may obtain a copy of the License at
> > +
> > +     http://www.apache.org/licenses/LICENSE-2.0
> > +
> > + Unless required by applicable law or agreed to in writing, software
> > + distributed under the License is distributed on an "AS IS" BASIS,
> > + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> > implied. + See the License for the specific language governing
> > permissions and + limitations under the License.
> > +-->
> > +<project name="urlfilter-domainblacklist" default="jar-core">
> > +
> > +  <import file="../build-plugin.xml"/>
> > +
> > +  <!-- for junit test -->
> > +  <mkdir dir="${build.test}/data"/>
> > +  <copy todir="${build.test}/data">
> > +    <fileset dir="data" />
> > +  </copy>
> > +
> > +</project>
> > 
> > Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt
> > URL:
> > http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblack
> > list/data/hosts.txt?rev=1292764&view=auto
> > 
> > =========================================================================
> > ===== --- nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt
> > (added) +++
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt Thu Feb
> > 23 12:32:49 2012
> > @@ -0,0 +1,5 @@
> > +# comments start with the pound sign
> > +net
> > +apache.org
> > +be
> > +www.yahoo.com
> > \ No newline at end of file
> > 
> > Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml
> > URL:
> > http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblack
> > list/ivy.xml?rev=1292764&view=auto
> > 
> > =========================================================================
> > ===== --- nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml
> > (added) +++ nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml Thu
> > Feb 23 12:32:49 2012
> > @@ -0,0 +1,41 @@
> > +<?xml version="1.0" ?>
> > +
> > +<!--
> > +   Licensed to the Apache Software Foundation (ASF) under one or more
> > +   contributor license agreements.  See the NOTICE file distributed with
> > +   this work for additional information regarding copyright ownership.
> > +   The ASF licenses this file to You under the Apache License, Version 2.0
> > +   (the "License"); you may not use this file except in compliance with
> > +   the License.  You may obtain a copy of the License at
> > +
> > +       http://www.apache.org/licenses/LICENSE-2.0
> > +
> > +   Unless required by applicable law or agreed to in writing, software
> > +   distributed under the License is distributed on an "AS IS" BASIS,
> > +   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> > implied.
> > +   See the License for the specific language governing permissions and
> > +   limitations under the License.
> > +-->
> > +
> > +<ivy-module version="1.0">
> > +  <info organisation="org.apache.nutch" module="${ant.project.name}">
> > +    <license name="Apache 2.0"/>
> > +    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
> > +    <description>
> > +        Apache Nutch
> > +    </description>
> > +  </info>
> > +
> > +  <configurations>
> > +    <include file="../../../ivy/ivy-configurations.xml"/>
> > +  </configurations>
> > +
> > +  <publications>
> > +    <!--get the artifact from our module name-->
> > +    <artifact conf="master"/>
> > +  </publications>
> > +
> > +  <dependencies>
> > +  </dependencies>
> > +
> > +</ivy-module>
> > 
> > Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml
> > URL:
> > http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblack
> > list/plugin.xml?rev=1292764&view=auto
> > 
> > =========================================================================
> > ===== --- nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml
> > (added) +++ nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml
> > Thu Feb 23 12:32:49 2012
> > @@ -0,0 +1,43 @@
> > +<?xml version="1.0" encoding="UTF-8"?>
> > +<!--
> > + Licensed to the Apache Software Foundation (ASF) under one or more
> > + contributor license agreements.  See the NOTICE file distributed with
> > + this work for additional information regarding copyright ownership.
> > + The ASF licenses this file to You under the Apache License, Version 2.0
> > + (the "License"); you may not use this file except in compliance with
> > + the License.  You may obtain a copy of the License at
> > +
> > +     http://www.apache.org/licenses/LICENSE-2.0
> > +
> > + Unless required by applicable law or agreed to in writing, software
> > + distributed under the License is distributed on an "AS IS" BASIS,
> > + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > + See the License for the specific language governing permissions and
> > + limitations under the License.
> > +-->
> > +<plugin
> > +   id="urlfilter-domainblacklist"
> > +   name="Domain Blacklist URL Filter"
> > +   version="1.0.0"
> > +   provider-name="nutch.org">
> > +
> > +   <runtime>
> > +      <library name="urlfilter-domainblacklist.jar">
> > +         <export name="*"/>
> > +      </library>
> > +   </runtime>
> > +
> > +   <requires>
> > +      <import plugin="nutch-extensionpoints"/>
> > +   </requires>
> > +
> > +   <extension id="org.apache.nutch.net.urlfilter.domainblacklist"
> > +              name="Nutch Domain Blacklist URL Filter"
> > +              point="org.apache.nutch.net.URLFilter">
> > +      <implementation id="DomainBlacklistURLFilter"
> > +
> > 
> >  class="org.apache.nutch.urlfilter.domainblacklist.DomainBlacklistURLFilt
> >  er">
> > 
> > +        <parameter name="file" value="domainblacklist-urlfilter.txt"/>
> > +      </implementation>
> > +   </extension>
> > +
> > +</plugin>
> > 
> > Added:
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutc
> > h/urlfilter/domainblacklist/DomainBlacklistURLFilter.java URL:
> > http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblack
> > list/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistU
> > RLFilter.java?rev=1292764&view=auto
> > 
> > =========================================================================
> > ===== ---
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutc
> > h/urlfilter/domainblacklist/DomainBlacklistURLFilter.java (added)
> > +++
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutc
> > h/urlfilter/domainblacklist/DomainBlacklistURLFilter.java Thu Feb 23
> > 12:32:49 2012
> > @@ -0,0 +1,203 @@
> > +/*
> > + * Licensed to the Apache Software Foundation (ASF) under one or more
> > + * contributor license agreements.  See the NOTICE file distributed with
> > + * this work for additional information regarding copyright ownership.
> > + * The ASF licenses this file to You under the Apache License, Version 2.0
> > + * (the "License"); you may not use this file except in compliance with
> > + * the License.  You may obtain a copy of the License at
> > + *
> > + *     http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> > implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +package org.apache.nutch.urlfilter.domainblacklist;
> > +
> > +import java.io.BufferedReader;
> > +import java.io.FileReader;
> > +import java.io.IOException;
> > +import java.io.Reader;
> > +import java.io.StringReader;
> > +import java.util.LinkedHashSet;
> > +import java.util.Set;
> > +
> > +import org.apache.commons.lang.StringUtils;
> > +import org.slf4j.Logger;
> > +import org.slf4j.LoggerFactory;
> > +import org.apache.hadoop.conf.Configuration;
> > +import org.apache.nutch.net.URLFilter;
> > +import org.apache.nutch.plugin.Extension;
> > +import org.apache.nutch.plugin.PluginRepository;
> > +import org.apache.nutch.util.URLUtil;
> > +import org.apache.nutch.util.domain.DomainSuffix;
> > +
> > +/**
> > + * <p>Filters URLs based on a file containing domain suffixes, domain
> > names, and
> > + * hostnames. A url that matches one of the suffixes, domains, or hosts
> > + * present in the file is filtered out.</p>
> > + *
> > + * <p>Urls are checked in order of domain suffix, domain name, and hostname
> > + * against entries in the domain file. The domain file would be set up as
> > + * follows with one entry per line:</p>
> > + *
> > + * <pre>
> > + * com
> > + * apache.org
> > + * www.apache.org
> > + * </pre>
> > + *
> > + * <p>The first line is an example of a filter that would block all .com
> > + * domains. The second line blocks all urls from apache.org and all of its
> > + * subdomains such as lucene.apache.org and hadoop.apache.org. The third line
> > + * would block only urls from www.apache.org. There is no specific ordering to
> > + * entries. The entries are from more general to more specific with the more
> > + * general overriding the more specific.</p>
> > + *
> > + * The domain file defaults to domainblacklist-urlfilter.txt in the
> > classpath but can be
> > + * overridden using the:
> > + *
> > + * <ul> <li>property "urlfilter.domainblacklist.file" in
> > + * ./conf/nutch-*.xml, and</li>
> > + * <li>attribute "file" in plugin.xml of this plugin</li> </ul>
> > + *
> > + * the attribute "file" has higher precedence if defined.
> > + */
> > +public class DomainBlacklistURLFilter
> > +  implements URLFilter {
> > +
> > +  private static final Logger LOG =
> > LoggerFactory.getLogger(DomainBlacklistURLFilter.class);
> > +
> > +  // read in attribute "file" of this plugin.
> > +  private static String attributeFile = null;
> > +  private Configuration conf;
> > +  private String domainFile = null;
> > +  private Set<String> domainSet = new LinkedHashSet<String>();
> > +
> > +  private void readConfiguration(Reader configReader)
> > +    throws IOException {
> > +
> > +    // read the configuration file, line by line
> > +    BufferedReader reader = new BufferedReader(configReader);
> > +    String line = null;
> > +    while ((line = reader.readLine()) != null) {
> > +      if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
> > +        // add non-blank lines and non-commented lines
> > +        domainSet.add(StringUtils.lowerCase(line));
> > +      }
> > +    }
> > +  }
> > +
> > +  /**
> > +   * Default constructor.
> > +   */
> > +  public DomainBlacklistURLFilter() {
> > +
> > +  }
> > +
> > +  /**
> > +   * Constructor that specifies the domain file to use.
> > +   *
> > +   * @param domainFile The domain file, overrides the
> > +   * domainblacklist-urlfilter.txt default.
> > +   *
> > +   * @throws IOException
> > +   */
> > +  public DomainBlacklistURLFilter(String domainFile) {
> > +    this.domainFile = domainFile;
> > +  }
> > +
> > +  /**
> > +   * Sets the configuration.
> > +   */
> > +  public void setConf(Configuration conf) {
> > +    this.conf = conf;
> > +
> > +    // get the extensions for domain urlfilter
> > +    String pluginName = "urlfilter-domainblacklist";
> > +    Extension[] extensions =
> > PluginRepository.get(conf).getExtensionPoint( +     
> > URLFilter.class.getName()).getExtensions();
> > +    for (int i = 0; i < extensions.length; i++) {
> > +      Extension extension = extensions[i];
> > +      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
> > +        attributeFile = extension.getAttribute("file");
> > +        break;
> > +      }
> > +    }
> > +
> > +    // handle blank non empty input
> > +    if (attributeFile != null && attributeFile.trim().equals("")) {
> > +      attributeFile = null;
> > +    }
> > +
> > +    if (attributeFile != null) {
> > +      if (LOG.isInfoEnabled()) {
> > +        LOG.info("Attribute \"file\" is defined for plugin " +
> > pluginName +          + " as " + attributeFile);
> > +      }
> > +    }
> > +    else {
> > +      if (LOG.isWarnEnabled()) {
> > +        LOG.warn("Attribute \"file\" is not defined in plugin.xml for
> > plugin "
> > +          + pluginName);
> > +      }
> > +    }
> > +
> > +    // domain file and attribute "file" take precedence if defined
> > +    String file = conf.get("urlfilter.domainblacklist.file");
> > +    String stringRules = conf.get("urlfilter.domainblacklist.rules");
> > +    if (domainFile != null) {
> > +      file = domainFile;
> > +    }
> > +    else if (attributeFile != null) {
> > +      file = attributeFile;
> > +    }
> > +    Reader reader = null;
> > +    if (stringRules != null) { // takes precedence over files
> > +      reader = new StringReader(stringRules);
> > +    } else {
> > +      reader = conf.getConfResourceAsReader(file);
> > +    }
> > +    try {
> > +      if (reader == null) {
> > +        reader = new FileReader(file);
> > +      }
> > +      readConfiguration(reader);
> > +    }
> > +    catch (IOException e) {
> > +     
> > LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); +  
> >  }
> > +  }
> > +
> > +  public Configuration getConf() {
> > +    return this.conf;
> > +  }
> > +
> > +  public String filter(String url) {
> > +
> > +    try {
> > +
> > +      // match for suffix, domain, and host in that order.  more general
> > will
> > +      // override more specific
> > +      String domain = URLUtil.getDomainName(url).toLowerCase().trim();
> > +      String host = URLUtil.getHost(url);
> > +      String suffix = null;
> > +      DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
> > +      if (domainSuffix != null) {
> > +        suffix = domainSuffix.getDomain();
> > +      }
> > +
> > +      if (domainSet.contains(suffix) || domainSet.contains(domain)
> > +        || domainSet.contains(host)) {
> > +        // Matches, filter!
> > +        return null;
> > +      }
> > +
> > +      // doesn't match, allow
> > +      return url;
> > +    }
> > +    catch (Exception e) {
> > +
> > +      // if an error happens, the url is filtered out (null is returned)
> > +      LOG.error("Could not apply filter on url: " + url + "\n"
> > +        + org.apache.hadoop.util.StringUtils.stringifyException(e));
> > +      return null;
> > +    }
> > +  }
> > +}
> > 
> > Added:
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutc
> > h/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java URL:
> > http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblack
> > list/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlackl
> > istURLFilter.java?rev=1292764&view=auto
> > 
> > =========================================================================
> > ===== ---
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutc
> > h/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java (added)
> > +++
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutc
> > h/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java Thu Feb 23
> > 12:32:49 2012
> > @@ -0,0 +1,57 @@
> > +/*
> > + * Licensed to the Apache Software Foundation (ASF) under one or more
> > + * contributor license agreements.  See the NOTICE file distributed with
> > + * this work for additional information regarding copyright ownership.
> > + * The ASF licenses this file to You under the Apache License, Version 2.0
> > + * (the "License"); you may not use this file except in compliance with
> > + * the License.  You may obtain a copy of the License at
> > + *
> > + *     http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> > implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +package org.apache.nutch.urlfilter.domainblacklist;
> > +
> > +import junit.framework.TestCase;
> > +
> > +import org.slf4j.Logger;
> > +import org.slf4j.LoggerFactory;
> > +import org.apache.hadoop.conf.Configuration;
> > +import org.apache.nutch.util.NutchConfiguration;
> > +
> > +public class TestDomainBlacklistURLFilter
> > +  extends TestCase {
> > +
> > +  protected static final Logger LOG =
> > LoggerFactory.getLogger(TestDomainBlacklistURLFilter.class);
> > +
> > +  private final static String SEPARATOR =
> > System.getProperty("file.separator");
> > +  private final static String SAMPLES = System.getProperty("test.data",
> > ".");
> > +
> > +  public TestDomainBlacklistURLFilter(String testName) {
> > +    super(testName);
> > +  }
> > +
> > +  public void testFilter()
> > +    throws Exception {
> > +
> > +    String domainBlacklistFile = SAMPLES + SEPARATOR + "hosts.txt";
> > +    Configuration conf = NutchConfiguration.create();
> > +    DomainBlacklistURLFilter domainBlacklistFilter = new
> > DomainBlacklistURLFilter(domainBlacklistFile);
> > +    domainBlacklistFilter.setConf(conf);
> > +   
> > assertNull(domainBlacklistFilter.filter("http://lucene.apache.org")); + 
> >   assertNull(domainBlacklistFilter.filter("http://hadoop.apache.org"));
> > +    assertNull(domainBlacklistFilter.filter("http://www.apache.org"));
> > +   
> > assertNotNull(domainBlacklistFilter.filter("http://www.google.com")); + 
> >   assertNotNull(domainBlacklistFilter.filter("http://mail.yahoo.com"));
> > +    assertNull(domainBlacklistFilter.filter("http://www.foobar.net"));
> > +    assertNull(domainBlacklistFilter.filter("http://www.foobas.net"));
> > +    assertNull(domainBlacklistFilter.filter("http://www.yahoo.com")); +
> >    assertNull(domainBlacklistFilter.filter("http://www.foobar.be")); +  
> >  assertNotNull(domainBlacklistFilter.filter("http://www.adobe.com")); + 
> > }
> > +
> > +}

-- 
Markus Jelsma - CTO - Openindex