You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@nutch.apache.org by Lewis John Mcgibbney <le...@gmail.com> on 2012/02/23 13:36:35 UTC
Re: svn commit: r1292764 - in /nutch/trunk: ./ conf/ src/plugin/
src/plugin/urlfilter-domainblacklist/ src/plugin/urlfilter-domainblacklist/data/
src/plugin/urlfilter-domainblacklist/src/ src/plugin/urlfilter-domainblacklist/src/java/
src/plugin/urlf
Hey Markus,
Great work with this one.
I notice that you did not add
<ant dir="urlfilter-domainblacklist" target="test" />
to nutch/trunk/src/plugin/build.xml
Lewis
On Thu, Feb 23, 2012 at 12:32 PM, <ma...@apache.org> wrote:
> Author: markus
> Date: Thu Feb 23 12:32:49 2012
> New Revision: 1292764
>
> URL: http://svn.apache.org/viewvc?rev=1292764&view=rev
> Log:
> NUTCH-1210 Domain Blacklist Filter
>
> Added:
> nutch/trunk/conf/domainblacklist-urlfilter.txt
> nutch/trunk/src/plugin/urlfilter-domainblacklist/
> nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml
> nutch/trunk/src/plugin/urlfilter-domainblacklist/data/
> nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt
> nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml
> nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/
>
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/
>
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/
>
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/
>
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/
>
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/
>
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/
>
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/
>
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
> Modified:
> nutch/trunk/CHANGES.txt
> nutch/trunk/src/plugin/build.xml
>
> Modified: nutch/trunk/CHANGES.txt
> URL:
> http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1292764&r1=1292763&r2=1292764&view=diff
>
> ==============================================================================
> --- nutch/trunk/CHANGES.txt (original)
> +++ nutch/trunk/CHANGES.txt Thu Feb 23 12:32:49 2012
> @@ -1,5 +1,7 @@
> Nutch Change Log
>
> +* NUTCH-1210 DomainBlacklistFilter (markus)
> +
> * NUTCH-965 Skip parsing for truncated documents (alexis, lewismc, ferdy)
>
> * NUTCH-1193 Incorrect url transform to lowercase: parameter solr
> (Eduardo dos Santos Leggiero via lewismc)
>
> Added: nutch/trunk/conf/domainblacklist-urlfilter.txt
> URL:
> http://svn.apache.org/viewvc/nutch/trunk/conf/domainblacklist-urlfilter.txt?rev=1292764&view=auto
>
> ==============================================================================
> --- nutch/trunk/conf/domainblacklist-urlfilter.txt (added)
> +++ nutch/trunk/conf/domainblacklist-urlfilter.txt Thu Feb 23 12:32:49 2012
> @@ -0,0 +1,16 @@
> +# Licensed to the Apache Software Foundation (ASF) under one or more
> +# contributor license agreements. See the NOTICE file distributed with
> +# this work for additional information regarding copyright ownership.
> +# The ASF licenses this file to You under the Apache License, Version 2.0
> +# (the "License"); you may not use this file except in compliance with
> +# the License. You may obtain a copy of the License at
> +#
> +# http://www.apache.org/licenses/LICENSE-2.0
> +#
> +# Unless required by applicable law or agreed to in writing, software
> +# distributed under the License is distributed on an "AS IS" BASIS,
> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> +# See the License for the specific language governing permissions and
> +# limitations under the License.
> +
> +# config file for urlfilter-domainblacklist plugin
>
> Modified: nutch/trunk/src/plugin/build.xml
> URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1292764&r1=1292763&r2=1292764&view=diff
>
> ==============================================================================
> --- nutch/trunk/src/plugin/build.xml (original)
> +++ nutch/trunk/src/plugin/build.xml Thu Feb 23 12:32:49 2012
> @@ -57,6 +57,7 @@
> <ant dir="tld" target="deploy"/>
> <ant dir="urlfilter-automaton" target="deploy"/>
> <ant dir="urlfilter-domain" target="deploy" />
> + <ant dir="urlfilter-domainblacklist" target="deploy" />
> <ant dir="urlfilter-prefix" target="deploy"/>
> <ant dir="urlfilter-regex" target="deploy"/>
> <ant dir="urlfilter-suffix" target="deploy"/>
> @@ -132,6 +133,7 @@
> <ant dir="tld" target="clean"/>
> <ant dir="urlfilter-automaton" target="clean"/>
> <ant dir="urlfilter-domain" target="clean" />
> + <ant dir="urlfilter-domainblacklist" target="clean" />
> <ant dir="urlfilter-prefix" target="clean"/>
> <ant dir="urlfilter-regex" target="clean"/>
> <ant dir="urlfilter-suffix" target="clean"/>
>
> Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml
> URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml?rev=1292764&view=auto
>
> ==============================================================================
> --- nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml (added)
> +++ nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml Thu Feb 23
> 12:32:49 2012
> @@ -0,0 +1,28 @@
> +<?xml version="1.0"?>
> +<!--
> + Licensed to the Apache Software Foundation (ASF) under one or more
> + contributor license agreements. See the NOTICE file distributed with
> + this work for additional information regarding copyright ownership.
> + The ASF licenses this file to You under the Apache License, Version 2.0
> + (the "License"); you may not use this file except in compliance with
> + the License. You may obtain a copy of the License at
> +
> + http://www.apache.org/licenses/LICENSE-2.0
> +
> + Unless required by applicable law or agreed to in writing, software
> + distributed under the License is distributed on an "AS IS" BASIS,
> + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + See the License for the specific language governing permissions and
> + limitations under the License.
> +-->
> +<project name="urlfilter-domainblacklist" default="jar-core">
> +
> + <import file="../build-plugin.xml"/>
> +
> + <!-- for junit test -->
> + <mkdir dir="${build.test}/data"/>
> + <copy todir="${build.test}/data">
> + <fileset dir="data" />
> + </copy>
> +
> +</project>
>
> Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt
> URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt?rev=1292764&view=auto
>
> ==============================================================================
> --- nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt (added)
> +++ nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt Thu
> Feb 23 12:32:49 2012
> @@ -0,0 +1,5 @@
> +# comments start with the pound sign
> +net
> +apache.org
> +be
> +www.yahoo.com
> \ No newline at end of file
>
> Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml
> URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml?rev=1292764&view=auto
>
> ==============================================================================
> --- nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml (added)
> +++ nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml Thu Feb 23
> 12:32:49 2012
> @@ -0,0 +1,41 @@
> +<?xml version="1.0" ?>
> +
> +<!--
> + Licensed to the Apache Software Foundation (ASF) under one or more
> + contributor license agreements. See the NOTICE file distributed with
> + this work for additional information regarding copyright ownership.
> + The ASF licenses this file to You under the Apache License, Version 2.0
> + (the "License"); you may not use this file except in compliance with
> + the License. You may obtain a copy of the License at
> +
> + http://www.apache.org/licenses/LICENSE-2.0
> +
> + Unless required by applicable law or agreed to in writing, software
> + distributed under the License is distributed on an "AS IS" BASIS,
> + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + See the License for the specific language governing permissions and
> + limitations under the License.
> +-->
> +
> +<ivy-module version="1.0">
> + <info organisation="org.apache.nutch" module="${ant.project.name}">
> + <license name="Apache 2.0"/>
> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
> + <description>
> + Apache Nutch
> + </description>
> + </info>
> +
> + <configurations>
> + <include file="../../../ivy/ivy-configurations.xml"/>
> + </configurations>
> +
> + <publications>
> + <!--get the artifact from our module name-->
> + <artifact conf="master"/>
> + </publications>
> +
> + <dependencies>
> + </dependencies>
> +
> +</ivy-module>
>
> Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml
> URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml?rev=1292764&view=auto
>
> ==============================================================================
> --- nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml (added)
> +++ nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml Thu Feb 23
> 12:32:49 2012
> @@ -0,0 +1,43 @@
> +<?xml version="1.0" encoding="UTF-8"?>
> +<!--
> + Licensed to the Apache Software Foundation (ASF) under one or more
> + contributor license agreements. See the NOTICE file distributed with
> + this work for additional information regarding copyright ownership.
> + The ASF licenses this file to You under the Apache License, Version 2.0
> + (the "License"); you may not use this file except in compliance with
> + the License. You may obtain a copy of the License at
> +
> + http://www.apache.org/licenses/LICENSE-2.0
> +
> + Unless required by applicable law or agreed to in writing, software
> + distributed under the License is distributed on an "AS IS" BASIS,
> + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + See the License for the specific language governing permissions and
> + limitations under the License.
> +-->
> +<plugin
> + id="urlfilter-domainblacklist"
> + name="Domain Blacklist URL Filter"
> + version="1.0.0"
> + provider-name="nutch.org">
> +
> + <runtime>
> + <library name="urlfilter-domainblacklist.jar">
> + <export name="*"/>
> + </library>
> + </runtime>
> +
> + <requires>
> + <import plugin="nutch-extensionpoints"/>
> + </requires>
> +
> + <extension id="org.apache.nutch.net.urlfilter.domainblacklist"
> + name="Nutch Domain Blacklist URL Filter"
> + point="org.apache.nutch.net.URLFilter">
> + <implementation id="DomainBlacklistURLFilter"
> +
> class="org.apache.nutch.urlfilter.domainblacklist.DomainBlacklistURLFilter">
> + <parameter name="file" value="domainblacklist-urlfilter.txt"/>
> + </implementation>
> + </extension>
> +
> +</plugin>
>
> Added:
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
> URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java?rev=1292764&view=auto
>
> ==============================================================================
> ---
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
> (added)
> +++
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
> Thu Feb 23 12:32:49 2012
> @@ -0,0 +1,203 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.nutch.urlfilter.domainblacklist;
> +
> +import java.io.BufferedReader;
> +import java.io.FileReader;
> +import java.io.IOException;
> +import java.io.Reader;
> +import java.io.StringReader;
> +import java.util.LinkedHashSet;
> +import java.util.Set;
> +
> +import org.apache.commons.lang.StringUtils;
> +import org.slf4j.Logger;
> +import org.slf4j.LoggerFactory;
> +import org.apache.hadoop.conf.Configuration;
> +import org.apache.nutch.net.URLFilter;
> +import org.apache.nutch.plugin.Extension;
> +import org.apache.nutch.plugin.PluginRepository;
> +import org.apache.nutch.util.URLUtil;
> +import org.apache.nutch.util.domain.DomainSuffix;
> +
> +/**
> + * <p>Filters URLs based on a file containing domain suffixes, domain
> names, and
> + * hostnames. A url that matches one of the suffixes, domains, or hosts
> + * present in the file is filtered out.</p>
> + *
> + * <p>Urls are checked in order of domain suffix, domain name, and
> hostname
> + * against entries in the domain file. The domain file would be setup as
> follows
> + * with one entry per line:
> + *
> + * <pre> com apache.org www.apache.org </pre>
> + *
> + * <p>The first line is an example of a filter that would allow all .com
> + * domains. The second line allows all urls from apache.org and all of
> its
> + * subdomains such as lucene.apache.org and hadoop.apache.org. The third
> line
> + * would allow only urls from www.apache.org. There is no specific
> ordering to
> + * entries. The entries are from more general to more specific with the
> more
> + * general overridding the more specific.</p>
> + *
> + * The domain file defaults to domainblacklist-urlfilter.txt in the
> classpath but can be
> + * overridden using the:
> + *
> + * <ul> <ol>property "urlfilter.domainblacklist.file" in
> ./conf/nutch-*.xml, and</ol>
> + * <ol>attribute "file" in plugin.xml of this plugin</ol> </ul>
> + *
> + * the attribute "file" has higher precedence if defined.
> + */
> +public class DomainBlacklistURLFilter
> + implements URLFilter {
> +
> + private static final Logger LOG =
> LoggerFactory.getLogger(DomainBlacklistURLFilter.class);
> +
> + // read in attribute "file" of this plugin.
> + private static String attributeFile = null;
> + private Configuration conf;
> + private String domainFile = null;
> + private Set<String> domainSet = new LinkedHashSet<String>();
> +
> + private void readConfiguration(Reader configReader)
> + throws IOException {
> +
> + // read the configuration file, line by line
> + BufferedReader reader = new BufferedReader(configReader);
> + String line = null;
> + while ((line = reader.readLine()) != null) {
> + if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
> + // add non-blank lines and non-commented lines
> + domainSet.add(StringUtils.lowerCase(line));
> + }
> + }
> + }
> +
> + /**
> + * Default constructor.
> + */
> + public DomainBlacklistURLFilter() {
> +
> + }
> +
> + /**
> + * Constructor that specifies the domain file to use.
> + *
> + * @param domainFile The domain file, overrides
> domainblacklist-urlfilter.text default.
> + *
> + * @throws IOException
> + */
> + public DomainBlacklistURLFilter(String domainFile) {
> + this.domainFile = domainFile;
> + }
> +
> + /**
> + * Sets the configuration.
> + */
> + public void setConf(Configuration conf) {
> + this.conf = conf;
> +
> + // get the extensions for domain urlfilter
> + String pluginName = "urlfilter-domainblacklist";
> + Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
> + URLFilter.class.getName()).getExtensions();
> + for (int i = 0; i < extensions.length; i++) {
> + Extension extension = extensions[i];
> + if (extension.getDescriptor().getPluginId().equals(pluginName)) {
> + attributeFile = extension.getAttribute("file");
> + break;
> + }
> + }
> +
> + // handle blank non empty input
> + if (attributeFile != null && attributeFile.trim().equals("")) {
> + attributeFile = null;
> + }
> +
> + if (attributeFile != null) {
> + if (LOG.isInfoEnabled()) {
> + LOG.info("Attribute \"file\" is defined for plugin " + pluginName
> + + " as " + attributeFile);
> + }
> + }
> + else {
> + if (LOG.isWarnEnabled()) {
> + LOG.warn("Attribute \"file\" is not defined in plugin.xml for
> plugin "
> + + pluginName);
> + }
> + }
> +
> + // domain file and attribute "file" take precedence if defined
> + String file = conf.get("urlfilter.domainblacklist.file");
> + String stringRules = conf.get("urlfilter.domainblacklist.rules");
> + if (domainFile != null) {
> + file = domainFile;
> + }
> + else if (attributeFile != null) {
> + file = attributeFile;
> + }
> + Reader reader = null;
> + if (stringRules != null) { // takes precedence over files
> + reader = new StringReader(stringRules);
> + } else {
> + reader = conf.getConfResourceAsReader(file);
> + }
> + try {
> + if (reader == null) {
> + reader = new FileReader(file);
> + }
> + readConfiguration(reader);
> + }
> + catch (IOException e) {
> + LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
> + }
> + }
> +
> + public Configuration getConf() {
> + return this.conf;
> + }
> +
> + public String filter(String url) {
> +
> + try {
> +
> + // match for suffix, domain, and host in that order. more general
> will
> + // override more specific
> + String domain = URLUtil.getDomainName(url).toLowerCase().trim();
> + String host = URLUtil.getHost(url);
> + String suffix = null;
> + DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
> + if (domainSuffix != null) {
> + suffix = domainSuffix.getDomain();
> + }
> +
> + if (domainSet.contains(suffix) || domainSet.contains(domain)
> + || domainSet.contains(host)) {
> + // Matches, filter!
> + return null;
> + }
> +
> + // doesn't match, allow
> + return url;
> + }
> + catch (Exception e) {
> +
> + // if an error happens, allow the url to pass
> + LOG.error("Could not apply filter on url: " + url + "\n"
> + + org.apache.hadoop.util.StringUtils.stringifyException(e));
> + return null;
> + }
> + }
> +}
>
> Added:
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
> URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java?rev=1292764&view=auto
>
> ==============================================================================
> ---
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
> (added)
> +++
> nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
> Thu Feb 23 12:32:49 2012
> @@ -0,0 +1,57 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.nutch.urlfilter.domainblacklist;
> +
> +import junit.framework.TestCase;
> +
> +import org.slf4j.Logger;
> +import org.slf4j.LoggerFactory;
> +import org.apache.hadoop.conf.Configuration;
> +import org.apache.nutch.util.NutchConfiguration;
> +
> +public class TestDomainBlacklistURLFilter
> + extends TestCase {
> +
> + protected static final Logger LOG =
> LoggerFactory.getLogger(TestDomainBlacklistURLFilter.class);
> +
> + private final static String SEPARATOR =
> System.getProperty("file.separator");
> + private final static String SAMPLES = System.getProperty("test.data",
> ".");
> +
> + public TestDomainBlacklistURLFilter(String testName) {
> + super(testName);
> + }
> +
> + public void testFilter()
> + throws Exception {
> +
> + String domainBlacklistFile = SAMPLES + SEPARATOR + "hosts.txt";
> + Configuration conf = NutchConfiguration.create();
> + DomainBlacklistURLFilter domainBlacklistFilter = new
> DomainBlacklistURLFilter(domainBlacklistFile);
> + domainBlacklistFilter.setConf(conf);
> + assertNull(domainBlacklistFilter.filter("http://lucene.apache.org"));
> + assertNull(domainBlacklistFilter.filter("http://hadoop.apache.org"));
> + assertNull(domainBlacklistFilter.filter("http://www.apache.org"));
> + assertNotNull(domainBlacklistFilter.filter("http://www.google.com"));
> + assertNotNull(domainBlacklistFilter.filter("http://mail.yahoo.com"));
> + assertNull(domainBlacklistFilter.filter("http://www.foobar.net"));
> + assertNull(domainBlacklistFilter.filter("http://www.foobas.net"));
> + assertNull(domainBlacklistFilter.filter("http://www.yahoo.com"));
> + assertNull(domainBlacklistFilter.filter("http://www.foobar.be"));
> + assertNotNull(domainBlacklistFilter.filter("http://www.adobe.com"));
> + }
> +
> +}
>
>
>
--
*Lewis*
Re: svn commit: r1292764 - in /nutch/trunk: ./ conf/ src/plugin/ src/plugin/urlfilter-domainblacklist/ src/plugin/urlfilter-domainblacklist/data/ src/plugin/urlfilter-domainblacklist/src/ src/plugin/urlfilter-domainblacklist/src/java/ src/plugin/urlf
Posted by Markus Jelsma <ma...@openindex.io>.
I didn't? I explicitly added it. I'll check again and commit if i have to.
Thanks
On Thursday 23 February 2012 13:36:35 Lewis John Mcgibbney wrote:
> Hey Markus,
>
> Great work with this one.
>
> I notice that you did not add
>
> <ant dir="urlfilter-domainblacklist" target="test" />
>
> to nutch/trunk/src/plugin/build.xml
>
> Lewis
>
> On Thu, Feb 23, 2012 at 12:32 PM, <ma...@apache.org> wrote:
> > Author: markus
> > Date: Thu Feb 23 12:32:49 2012
> > New Revision: 1292764
> >
> > URL: http://svn.apache.org/viewvc?rev=1292764&view=rev
> > Log:
> > NUTCH-1210 Domain Blacklist Filter
> >
> > Added:
> > nutch/trunk/conf/domainblacklist-urlfilter.txt
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/data/
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/
> >
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nut
> > ch/
> >
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nut
> > ch/urlfilter/
> >
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nut
> > ch/urlfilter/domainblacklist/
> >
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nut
> > ch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
> >
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/
> >
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nut
> > ch/
> >
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nut
> > ch/urlfilter/
> >
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nut
> > ch/urlfilter/domainblacklist/
> >
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nut
> > ch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
> >
> > Modified:
> > nutch/trunk/CHANGES.txt
> > nutch/trunk/src/plugin/build.xml
> >
> > Modified: nutch/trunk/CHANGES.txt
> > URL:
> > http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1292764&r1=12927
> > 63&r2=1292764&view=diff
> >
> > =========================================================================
> > ===== --- nutch/trunk/CHANGES.txt (original)
> > +++ nutch/trunk/CHANGES.txt Thu Feb 23 12:32:49 2012
> > @@ -1,5 +1,7 @@
> >
> > Nutch Change Log
> >
> > +* NUTCH-1210 DomainBlacklistFilter (markus)
> > +
> >
> > * NUTCH-965 Skip parsing for truncated documents (alexis, lewismc,
> > ferdy)
> >
> > * NUTCH-1193 Incorrect url transform to lowercase: parameter solr
> >
> > (Eduardo dos Santos Leggiero via lewismc)
> >
> > Added: nutch/trunk/conf/domainblacklist-urlfilter.txt
> > URL:
> > http://svn.apache.org/viewvc/nutch/trunk/conf/domainblacklist-urlfilter.t
> > xt?rev=1292764&view=auto
> >
> > =========================================================================
> > ===== --- nutch/trunk/conf/domainblacklist-urlfilter.txt (added)
> > +++ nutch/trunk/conf/domainblacklist-urlfilter.txt Thu Feb 23 12:32:49
> > 2012 @@ -0,0 +1,16 @@
> > +# Licensed to the Apache Software Foundation (ASF) under one or more
> > +# contributor license agreements. See the NOTICE file distributed with
> > +# this work for additional information regarding copyright ownership.
> > +# The ASF licenses this file to You under the Apache License, Version
> > 2.0 +# (the "License"); you may not use this file except in compliance
> > with +# the License. You may obtain a copy of the License at
> > +#
> > +# http://www.apache.org/licenses/LICENSE-2.0
> > +#
> > +# Unless required by applicable law or agreed to in writing, software
> > +# distributed under the License is distributed on an "AS IS" BASIS,
> > +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> > implied. +# See the License for the specific language governing
> > permissions and +# limitations under the License.
> > +
> > +# config file for urlfilter-domainblacklist plugin
> >
> > Modified: nutch/trunk/src/plugin/build.xml
> > URL:
> > http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1292764
> > &r1=1292763&r2=1292764&view=diff
> >
> > =========================================================================
> > ===== --- nutch/trunk/src/plugin/build.xml (original)
> > +++ nutch/trunk/src/plugin/build.xml Thu Feb 23 12:32:49 2012
> > @@ -57,6 +57,7 @@
> >
> > <ant dir="tld" target="deploy"/>
> > <ant dir="urlfilter-automaton" target="deploy"/>
> > <ant dir="urlfilter-domain" target="deploy" />
> >
> > + <ant dir="urlfilter-domainblacklist" target="deploy" />
> >
> > <ant dir="urlfilter-prefix" target="deploy"/>
> > <ant dir="urlfilter-regex" target="deploy"/>
> > <ant dir="urlfilter-suffix" target="deploy"/>
> >
> > @@ -132,6 +133,7 @@
> >
> > <ant dir="tld" target="clean"/>
> > <ant dir="urlfilter-automaton" target="clean"/>
> > <ant dir="urlfilter-domain" target="clean" />
> >
> > + <ant dir="urlfilter-domainblacklist" target="clean" />
> >
> > <ant dir="urlfilter-prefix" target="clean"/>
> > <ant dir="urlfilter-regex" target="clean"/>
> > <ant dir="urlfilter-suffix" target="clean"/>
> >
> > Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml
> > URL:
> > http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblack
> > list/build.xml?rev=1292764&view=auto
> >
> > =========================================================================
> > ===== --- nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml
> > (added) +++ nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml
> > Thu Feb 23 12:32:49 2012
> > @@ -0,0 +1,28 @@
> > +<?xml version="1.0"?>
> > +<!--
> > + Licensed to the Apache Software Foundation (ASF) under one or more
> > + contributor license agreements. See the NOTICE file distributed with
> > + this work for additional information regarding copyright ownership.
> > + The ASF licenses this file to You under the Apache License, Version 2.0
> > + (the "License"); you may not use this file except in compliance with
> > + the License. You may obtain a copy of the License at
> > +
> > + http://www.apache.org/licenses/LICENSE-2.0
> > +
> > + Unless required by applicable law or agreed to in writing, software
> > + distributed under the License is distributed on an "AS IS" BASIS,
> > + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> > implied. + See the License for the specific language governing
> > permissions and + limitations under the License.
> > +-->
> > +<project name="urlfilter-domainblacklist" default="jar-core">
> > +
> > + <import file="../build-plugin.xml"/>
> > +
> > + <!-- for junit test -->
> > + <mkdir dir="${build.test}/data"/>
> > + <copy todir="${build.test}/data">
> > + <fileset dir="data" />
> > + </copy>
> > +
> > +</project>
> >
> > Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt
> > URL:
> > http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblack
> > list/data/hosts.txt?rev=1292764&view=auto
> >
> > =========================================================================
> > ===== --- nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt
> > (added) +++
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt Thu Feb
> > 23 12:32:49 2012
> > @@ -0,0 +1,5 @@
> > +# comments start with the pound sign
> > +net
> > +apache.org
> > +be
> > +www.yahoo.com
> > \ No newline at end of file
> >
> > Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml
> > URL:
> > http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblack
> > list/ivy.xml?rev=1292764&view=auto
> >
> > =========================================================================
> > ===== --- nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml
> > (added) +++ nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml Thu
> > Feb 23 12:32:49 2012
> > @@ -0,0 +1,41 @@
> > +<?xml version="1.0" ?>
> > +
> > +<!--
> > + Licensed to the Apache Software Foundation (ASF) under one or more
> > + contributor license agreements. See the NOTICE file distributed with
> > + this work for additional information regarding copyright ownership.
> > + The ASF licenses this file to You under the Apache License, Version
> > 2.0 + (the "License"); you may not use this file except in compliance
> > with + the License. You may obtain a copy of the License at
> > +
> > + http://www.apache.org/licenses/LICENSE-2.0
> > +
> > + Unless required by applicable law or agreed to in writing, software
> > + distributed under the License is distributed on an "AS IS" BASIS,
> > + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> > implied.
> > + See the License for the specific language governing permissions and
> > + limitations under the License.
> > +-->
> > +
> > +<ivy-module version="1.0">
> > + <info organisation="org.apache.nutch" module="${ant.project.name}">
> > + <license name="Apache 2.0"/>
> > + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
> > + <description>
> > + Apache Nutch
> > + </description>
> > + </info>
> > +
> > + <configurations>
> > + <include file="../../../ivy/ivy-configurations.xml"/>
> > + </configurations>
> > +
> > + <publications>
> > + <!--get the artifact from our module name-->
> > + <artifact conf="master"/>
> > + </publications>
> > +
> > + <dependencies>
> > + </dependencies>
> > +
> > +</ivy-module>
> >
> > Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml
> > URL:
> > http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblack
> > list/plugin.xml?rev=1292764&view=auto
> >
> > =========================================================================
> > ===== --- nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml
> > (added) +++ nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml
> > Thu Feb 23 12:32:49 2012
> > @@ -0,0 +1,43 @@
> > +<?xml version="1.0" encoding="UTF-8"?>
> > +<!--
> > + Licensed to the Apache Software Foundation (ASF) under one or more
> > + contributor license agreements. See the NOTICE file distributed with
> > + this work for additional information regarding copyright ownership.
> > + The ASF licenses this file to You under the Apache License, Version 2.0
> > + (the "License"); you may not use this file except in compliance with
> > + the License. You may obtain a copy of the License at
> > +
> > + http://www.apache.org/licenses/LICENSE-2.0
> > +
> > + Unless required by applicable law or agreed to in writing, software
> > + distributed under the License is distributed on an "AS IS" BASIS,
> > + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> > implied. + See the License for the specific language governing
> > permissions and + limitations under the License.
> > +-->
> > +<plugin
> > + id="urlfilter-domainblacklist"
> > + name="Domain Blacklist URL Filter"
> > + version="1.0.0"
> > + provider-name="nutch.org">
> > +
> > + <runtime>
> > + <library name="urlfilter-domainblacklist.jar">
> > + <export name="*"/>
> > + </library>
> > + </runtime>
> > +
> > + <requires>
> > + <import plugin="nutch-extensionpoints"/>
> > + </requires>
> > +
> > + <extension id="org.apache.nutch.net.urlfilter.domainblacklist"
> > + name="Nutch Domain Blacklist URL Filter"
> > + point="org.apache.nutch.net.URLFilter">
> > + <implementation id="DomainBlacklistURLFilter"
> > +
> >
> > class="org.apache.nutch.urlfilter.domainblacklist.DomainBlacklistURLFilt
> > er">
> >
> > + <parameter name="file" value="domainblacklist-urlfilter.txt"/>
> > + </implementation>
> > + </extension>
> > +
> > +</plugin>
> >
> > Added:
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutc
> > h/urlfilter/domainblacklist/DomainBlacklistURLFilter.java URL:
> > http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblack
> > list/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistU
> > RLFilter.java?rev=1292764&view=auto
> >
> > =========================================================================
> > ===== ---
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutc
> > h/urlfilter/domainblacklist/DomainBlacklistURLFilter.java (added)
> > +++
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutc
> > h/urlfilter/domainblacklist/DomainBlacklistURLFilter.java Thu Feb 23
> > 12:32:49 2012
> > @@ -0,0 +1,203 @@
> > +/*
> > + * Licensed to the Apache Software Foundation (ASF) under one or more
> > + * contributor license agreements. See the NOTICE file distributed with
> > + * this work for additional information regarding copyright ownership.
> > + * The ASF licenses this file to You under the Apache License, Version
> > 2.0 + * (the "License"); you may not use this file except in compliance
> > with + * the License. You may obtain a copy of the License at
> > + *
> > + * http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> > implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +package org.apache.nutch.urlfilter.domainblacklist;
> > +
> > +import java.io.BufferedReader;
> > +import java.io.FileReader;
> > +import java.io.IOException;
> > +import java.io.Reader;
> > +import java.io.StringReader;
> > +import java.util.LinkedHashSet;
> > +import java.util.Set;
> > +
> > +import org.apache.commons.lang.StringUtils;
> > +import org.slf4j.Logger;
> > +import org.slf4j.LoggerFactory;
> > +import org.apache.hadoop.conf.Configuration;
> > +import org.apache.nutch.net.URLFilter;
> > +import org.apache.nutch.plugin.Extension;
> > +import org.apache.nutch.plugin.PluginRepository;
> > +import org.apache.nutch.util.URLUtil;
> > +import org.apache.nutch.util.domain.DomainSuffix;
> > +
> > +/**
> > + * <p>Filters URLs based on a file containing domain suffixes, domain
> > names, and
> > + * hostnames. A url that matches one of the suffixes, domains, or hosts
> > + * present in the file is filtered out.</p>
> > + *
> > + * <p>Urls are checked in order of domain suffix, domain name, and
> > hostname
> > + * against entries in the domain file. The domain file would be setup as
> > follows
> > + * with one entry per line:
> > + *
> > + * <pre> com apache.org www.apache.org </pre>
> > + *
> > + * <p>The first line is an example of a filter that would allow all .com
> > + * domains. The second line allows all urls from apache.org and all of
> > its
> > + * subdomains such as lucene.apache.org and hadoop.apache.org. The third
> > line
> > + * would allow only urls from www.apache.org. There is no specific
> > ordering to
> > + * entries. The entries are from more general to more specific with the
> > more
> > + * general overridding the more specific.</p>
> > + *
> > + * The domain file defaults to domainblacklist-urlfilter.txt in the
> > classpath but can be
> > + * overridden using the:
> > + *
> > + * <ul> <ol>property "urlfilter.domainblacklist.file" in
> > ./conf/nutch-*.xml, and</ol>
> > + * <ol>attribute "file" in plugin.xml of this plugin</ol> </ul>
> > + *
> > + * the attribute "file" has higher precedence if defined.
> > + */
> > +public class DomainBlacklistURLFilter
> > + implements URLFilter {
> > +
> > + private static final Logger LOG =
> > LoggerFactory.getLogger(DomainBlacklistURLFilter.class);
> > +
> > + // read in attribute "file" of this plugin.
> > + private static String attributeFile = null;
> > + private Configuration conf;
> > + private String domainFile = null;
> > + private Set<String> domainSet = new LinkedHashSet<String>();
> > +
> > + private void readConfiguration(Reader configReader)
> > + throws IOException {
> > +
> > + // read the configuration file, line by line
> > + BufferedReader reader = new BufferedReader(configReader);
> > + String line = null;
> > + while ((line = reader.readLine()) != null) {
> > + if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
> > + // add non-blank lines and non-commented lines
> > + domainSet.add(StringUtils.lowerCase(line));
> > + }
> > + }
> > + }
> > +
> > + /**
> > + * Default constructor.
> > + */
> > + public DomainBlacklistURLFilter() {
> > +
> > + }
> > +
> > + /**
> > + * Constructor that specifies the domain file to use.
> > + *
> > + * @param domainFile The domain file, overrides
> > domainblacklist-urlfilter.text default.
> > + *
> > + * @throws IOException
> > + */
> > + public DomainBlacklistURLFilter(String domainFile) {
> > + this.domainFile = domainFile;
> > + }
> > +
> > + /**
> > + * Sets the configuration.
> > + */
> > + public void setConf(Configuration conf) {
> > + this.conf = conf;
> > +
> > + // get the extensions for domain urlfilter
> > + String pluginName = "urlfilter-domainblacklist";
> > + Extension[] extensions =
> > PluginRepository.get(conf).getExtensionPoint( +
> > URLFilter.class.getName()).getExtensions();
> > + for (int i = 0; i < extensions.length; i++) {
> > + Extension extension = extensions[i];
> > + if (extension.getDescriptor().getPluginId().equals(pluginName)) {
> > + attributeFile = extension.getAttribute("file");
> > + break;
> > + }
> > + }
> > +
> > + // handle blank non empty input
> > + if (attributeFile != null && attributeFile.trim().equals("")) {
> > + attributeFile = null;
> > + }
> > +
> > + if (attributeFile != null) {
> > + if (LOG.isInfoEnabled()) {
> > + LOG.info("Attribute \"file\" is defined for plugin " +
> > pluginName + + " as " + attributeFile);
> > + }
> > + }
> > + else {
> > + if (LOG.isWarnEnabled()) {
> > + LOG.warn("Attribute \"file\" is not defined in plugin.xml for
> > plugin "
> > + + pluginName);
> > + }
> > + }
> > +
> > + // domain file and attribute "file" take precedence if defined
> > + String file = conf.get("urlfilter.domainblacklist.file");
> > + String stringRules = conf.get("urlfilter.domainblacklist.rules");
> > + if (domainFile != null) {
> > + file = domainFile;
> > + }
> > + else if (attributeFile != null) {
> > + file = attributeFile;
> > + }
> > + Reader reader = null;
> > + if (stringRules != null) { // takes precedence over files
> > + reader = new StringReader(stringRules);
> > + } else {
> > + reader = conf.getConfResourceAsReader(file);
> > + }
> > + try {
> > + if (reader == null) {
> > + reader = new FileReader(file);
> > + }
> > + readConfiguration(reader);
> > + }
> > + catch (IOException e) {
> > +
> > LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); +
> > }
> > + }
> > +
> > + public Configuration getConf() {
> > + return this.conf;
> > + }
> > +
> > + public String filter(String url) {
> > +
> > + try {
> > +
> > + // match for suffix, domain, and host in that order. more general
> > will
> > + // override more specific
> > + String domain = URLUtil.getDomainName(url).toLowerCase().trim();
> > + String host = URLUtil.getHost(url);
> > + String suffix = null;
> > + DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
> > + if (domainSuffix != null) {
> > + suffix = domainSuffix.getDomain();
> > + }
> > +
> > + if (domainSet.contains(suffix) || domainSet.contains(domain)
> > + || domainSet.contains(host)) {
> > + // Matches, filter!
> > + return null;
> > + }
> > +
> > + // doesn't match, allow
> > + return url;
> > + }
> > + catch (Exception e) {
> > +
> > + // if an error happens, allow the url to pass
> > + LOG.error("Could not apply filter on url: " + url + "\n"
> > + + org.apache.hadoop.util.StringUtils.stringifyException(e));
> > + return null;
> > + }
> > + }
> > +}
> >
> > Added:
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutc
> > h/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java URL:
> > http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblack
> > list/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlackl
> > istURLFilter.java?rev=1292764&view=auto
> >
> > =========================================================================
> > ===== ---
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutc
> > h/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java (added)
> > +++
> > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutc
> > h/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java Thu Feb 23
> > 12:32:49 2012
> > @@ -0,0 +1,57 @@
> > +/*
> > + * Licensed to the Apache Software Foundation (ASF) under one or more
> > + * contributor license agreements. See the NOTICE file distributed with
> > + * this work for additional information regarding copyright ownership.
> > + * The ASF licenses this file to You under the Apache License, Version
> > 2.0 + * (the "License"); you may not use this file except in compliance
> > with + * the License. You may obtain a copy of the License at
> > + *
> > + * http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> > implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +package org.apache.nutch.urlfilter.domainblacklist;
> > +
> > +import junit.framework.TestCase;
> > +
> > +import org.slf4j.Logger;
> > +import org.slf4j.LoggerFactory;
> > +import org.apache.hadoop.conf.Configuration;
> > +import org.apache.nutch.util.NutchConfiguration;
> > +
> > +public class TestDomainBlacklistURLFilter
> > + extends TestCase {
> > +
> > + protected static final Logger LOG =
> > LoggerFactory.getLogger(TestDomainBlacklistURLFilter.class);
> > +
> > + private final static String SEPARATOR =
> > System.getProperty("file.separator");
> > + private final static String SAMPLES = System.getProperty("test.data",
> > ".");
> > +
> > + public TestDomainBlacklistURLFilter(String testName) {
> > + super(testName);
> > + }
> > +
> > + public void testFilter()
> > + throws Exception {
> > +
> > + String domainBlacklistFile = SAMPLES + SEPARATOR + "hosts.txt";
> > + Configuration conf = NutchConfiguration.create();
> > + DomainBlacklistURLFilter domainBlacklistFilter = new
> > DomainBlacklistURLFilter(domainBlacklistFile);
> > + domainBlacklistFilter.setConf(conf);
> > +
> > assertNull(domainBlacklistFilter.filter("http://lucene.apache.org")); +
> > assertNull(domainBlacklistFilter.filter("http://hadoop.apache.org"));
> > + assertNull(domainBlacklistFilter.filter("http://www.apache.org"));
> > +
> > assertNotNull(domainBlacklistFilter.filter("http://www.google.com")); +
> > assertNotNull(domainBlacklistFilter.filter("http://mail.yahoo.com"));
> > + assertNull(domainBlacklistFilter.filter("http://www.foobar.net"));
> > + assertNull(domainBlacklistFilter.filter("http://www.foobas.net"));
> > + assertNull(domainBlacklistFilter.filter("http://www.yahoo.com")); +
> > assertNull(domainBlacklistFilter.filter("http://www.foobar.be")); +
> > assertNotNull(domainBlacklistFilter.filter("http://www.adobe.com")); +
> > }
> > +
> > +}
--
Markus Jelsma - CTO - Openindex