You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:18 UTC
[02/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build
for nutch-core and nutch-plugins
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domain/src/test/java/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/src/test/java/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java b/nutch-plugins/urlfilter-domain/src/test/java/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
new file mode 100644
index 0000000..0be1e31
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/src/test/java/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.domain;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestDomainURLFilter {
+ // Test data is looked up in the directory named by the "test.data" system property (default ".").
+ private final static String SEPARATOR = System.getProperty("file.separator");
+ private final static String SAMPLES = System.getProperty("test.data", ".");
+
+ @Test
+ public void testFilter() throws Exception {
+ // hosts.txt presumably lists net, apache.org, be, www.yahoo.com (confirm against the test resource); only matching URLs must pass
+ String domainFile = SAMPLES + SEPARATOR + "hosts.txt";
+ Configuration conf = NutchConfiguration.create();
+ DomainURLFilter domainFilter = new DomainURLFilter(domainFile);
+ domainFilter.setConf(conf);
+ Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org"));
+ Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org"));
+ Assert.assertNotNull(domainFilter.filter("http://www.apache.org"));
+ Assert.assertNull(domainFilter.filter("http://www.google.com"));
+ Assert.assertNull(domainFilter.filter("http://mail.yahoo.com"));
+ Assert.assertNotNull(domainFilter.filter("http://www.foobar.net"));
+ Assert.assertNotNull(domainFilter.filter("http://www.foobas.net"));
+ Assert.assertNotNull(domainFilter.filter("http://www.yahoo.com"));
+ Assert.assertNotNull(domainFilter.filter("http://www.foobar.be"));
+ Assert.assertNull(domainFilter.filter("http://www.adobe.com"));
+ }
+
+ @Test
+ public void testNoFilter() throws Exception {
+ // https://issues.apache.org/jira/browse/NUTCH-2189 - with a domain file that does not exist, every URL is expected to pass
+ String domainFile = SAMPLES + SEPARATOR + "this-file-does-not-exist.txt";
+ Configuration conf = NutchConfiguration.create();
+ DomainURLFilter domainFilter = new DomainURLFilter(domainFile);
+ domainFilter.setConf(conf);
+ Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org"));
+ Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org"));
+ Assert.assertNotNull(domainFilter.filter("http://www.apache.org"));
+ Assert.assertNotNull(domainFilter.filter("http://www.google.com"));
+ Assert.assertNotNull(domainFilter.filter("http://mail.yahoo.com"));
+ Assert.assertNotNull(domainFilter.filter("http://www.foobar.net"));
+ Assert.assertNotNull(domainFilter.filter("http://www.foobas.net"));
+ Assert.assertNotNull(domainFilter.filter("http://www.yahoo.com"));
+ Assert.assertNotNull(domainFilter.filter("http://www.foobar.be"));
+ Assert.assertNotNull(domainFilter.filter("http://www.adobe.com"));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domain/src/test/resources/hosts.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/src/test/resources/hosts.txt b/nutch-plugins/urlfilter-domain/src/test/resources/hosts.txt
new file mode 100644
index 0000000..2b88c3b
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/src/test/resources/hosts.txt
@@ -0,0 +1,5 @@
+# comments start with the pound sign
+net
+apache.org
+be
+www.yahoo.com
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domainblacklist/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domainblacklist/build.xml b/nutch-plugins/urlfilter-domainblacklist/build.xml
new file mode 100644
index 0000000..19ea483
--- /dev/null
+++ b/nutch-plugins/urlfilter-domainblacklist/build.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-domainblacklist" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+ <!-- for junit test -->
+ <mkdir dir="${build.test}/data"/>
+ <copy todir="${build.test}/data">
+ <fileset dir="data" />
+ </copy>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domainblacklist/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domainblacklist/ivy.xml b/nutch-plugins/urlfilter-domainblacklist/ivy.xml
new file mode 100644
index 0000000..24d7606
--- /dev/null
+++ b/nutch-plugins/urlfilter-domainblacklist/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../../ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domainblacklist/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domainblacklist/plugin.xml b/nutch-plugins/urlfilter-domainblacklist/plugin.xml
new file mode 100644
index 0000000..04eee6e
--- /dev/null
+++ b/nutch-plugins/urlfilter-domainblacklist/plugin.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="urlfilter-domainblacklist"
+ name="Domain Blacklist URL Filter"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="urlfilter-domainblacklist.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.net.urlfilter.domainblacklist"
+ name="Nutch Domain Blacklist URL Filter"
+ point="org.apache.nutch.net.URLFilter">
+ <implementation id="DomainBlacklistURLFilter"
+ class="org.apache.nutch.urlfilter.domainblacklist.DomainBlacklistURLFilter">
+ <parameter name="file" value="domainblacklist-urlfilter.txt"/>
+ </implementation>
+ </extension>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domainblacklist/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domainblacklist/pom.xml b/nutch-plugins/urlfilter-domainblacklist/pom.xml
new file mode 100644
index 0000000..a814579
--- /dev/null
+++ b/nutch-plugins/urlfilter-domainblacklist/pom.xml
@@ -0,0 +1,38 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>urlfilter-domainblacklist</artifactId>
+ <packaging>jar</packaging>
+
+ <name>urlfilter-domainblacklist</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java b/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
new file mode 100644
index 0000000..37b1cdc
--- /dev/null
+++ b/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
@@ -0,0 +1,210 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.domainblacklist;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+import org.apache.commons.lang.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLFilter;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.util.URLUtil;
+import org.apache.nutch.util.domain.DomainSuffix;
+
+/**
+ * <p>
+ * Filters URLs based on a file containing domain suffixes, domain names, and
+ * hostnames. A url that matches one of the suffixes, domains, or hosts present
+ * in the file is filtered out.
+ * </p>
+ *
+ * <p>
+ * Urls are checked in order of domain suffix, domain name, and hostname against
+ * entries in the domain file. The domain file would be setup as follows with
+ * one entry per line:
+ * </p>
+ *
+ * <pre>
+ * com
+ * apache.org
+ * www.apache.org
+ * </pre>
+ *
+ * <p>
+ * The first line is an example of a filter that would exclude all .com
+ * domains. The second line excludes all urls from apache.org and all of its
+ * subdomains such as lucene.apache.org and hadoop.apache.org. The third line
+ * would exclude only urls from www.apache.org. There is no specific ordering
+ * to entries; a url is rejected when its suffix, domain, or host is listed.
+ * </p>
+ *
+ * The domain file defaults to domainblacklist-urlfilter.txt in the classpath
+ * but can be overridden using the:
+ *
+ * <ul>
+ * <li>property "urlfilter.domainblacklist.file" in ./conf/nutch-*.xml, and</li>
+ * <li>attribute "file" in plugin.xml of this plugin</li>
+ * </ul>
+ *
+ * the attribute "file" has higher precedence if defined.
+ */
+public class DomainBlacklistURLFilter implements URLFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DomainBlacklistURLFilter.class);
+
+  // attribute "file" of this plugin; static, so shared by all instances
+  private static String attributeFile = null;
+  private Configuration conf;
+  private String domainFile = null;
+  private Set<String> domainSet = new LinkedHashSet<String>();
+
+  /**
+   * Reads blacklist entries, one per line, into the domain set. Entries are
+   * trimmed and lowercased; blank lines and lines whose first non-blank
+   * character is '#' are skipped.
+   * @param configReader source of the entries; always closed before returning
+   * @throws IOException if reading from or closing the reader fails
+   */
+  private void readConfiguration(Reader configReader) throws IOException {
+    BufferedReader reader = new BufferedReader(configReader);
+    try {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        String entry = line.trim();
+        if (StringUtils.isNotBlank(entry) && !entry.startsWith("#")) {
+          domainSet.add(StringUtils.lowerCase(entry));
+        }
+      }
+    } finally {
+      reader.close();
+    }
+  }
+
+  /** Default constructor. */
+  public DomainBlacklistURLFilter() {
+  }
+
+  /**
+   * Constructor that specifies the domain file to use.
+   *
+   * @param domainFile
+   *          The domain file, overrides domainblacklist-urlfilter.txt default.
+   */
+  public DomainBlacklistURLFilter(String domainFile) {
+    this.domainFile = domainFile;
+  }
+
+  /**
+   * Sets the configuration and loads the blacklist. Rules come from the
+   * property "urlfilter.domainblacklist.rules" if set; otherwise from the file
+   * named by the constructor argument, the plugin attribute "file", or the
+   * property "urlfilter.domainblacklist.file", in that order of precedence.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // get the extensions for domain urlfilter
+    String pluginName = "urlfilter-domainblacklist";
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint(URLFilter.class.getName()).getExtensions();
+    for (Extension extension : extensions) {
+      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+        attributeFile = extension.getAttribute("file");
+        break;
+      }
+    }
+
+    // handle blank non empty input
+    if (attributeFile != null && attributeFile.trim().equals("")) {
+      attributeFile = null;
+    }
+    if (attributeFile != null) {
+      LOG.info("Attribute \"file\" is defined for plugin {} as {}", pluginName,
+          attributeFile);
+    } else {
+      LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin {}",
+          pluginName);
+    }
+
+    // domain file and attribute "file" take precedence if defined
+    String file = conf.get("urlfilter.domainblacklist.file");
+    String stringRules = conf.get("urlfilter.domainblacklist.rules");
+    if (domainFile != null) {
+      file = domainFile;
+    } else if (attributeFile != null) {
+      file = attributeFile;
+    }
+    // inline rules take precedence over files
+    Reader reader = stringRules != null ? new StringReader(stringRules)
+        : conf.getConfResourceAsReader(file);
+    try {
+      if (reader == null) {
+        reader = new FileReader(file);
+      }
+      readConfiguration(reader);
+    } catch (IOException e) {
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Rejects the url if its domain suffix, domain name, or host is blacklisted.
+   *
+   * @param url the url to check
+   * @return the url, or null if it is blacklisted or an exception occurs
+   */
+  public String filter(String url) {
+    try {
+      // check suffix, domain, and host; any single match rejects the url
+      String domain = URLUtil.getDomainName(url).toLowerCase().trim();
+      String host = URLUtil.getHost(url);
+      String suffix = null;
+      DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
+      if (domainSuffix != null) {
+        suffix = domainSuffix.getDomain();
+      }
+
+      if (domainSet.contains(suffix) || domainSet.contains(domain)
+          || domainSet.contains(host)) {
+        // Matches, filter!
+        return null;
+      }
+
+      // doesn't match, allow
+      return url;
+    } catch (Exception e) {
+      // a url that cannot be checked is rejected (null), not passed through
+      LOG.error("Could not apply filter on url: " + url + "\n"
+          + org.apache.hadoop.util.StringUtils.stringifyException(e));
+      return null;
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/package-info.java b/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/package-info.java
new file mode 100644
index 0000000..1f0022c
--- /dev/null
+++ b/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/package-info.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL filter plugin to exclude URLs by domain suffixes, domain names, and/or host names.
+ * See {@link org.apache.nutch.urlfilter.domain} for the counterpart (include only URLs
+ * matching host or domain).
+ */
+package org.apache.nutch.urlfilter.domainblacklist;
+
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domainblacklist/src/test/java/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domainblacklist/src/test/java/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java b/nutch-plugins/urlfilter-domainblacklist/src/test/java/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
new file mode 100644
index 0000000..d253867
--- /dev/null
+++ b/nutch-plugins/urlfilter-domainblacklist/src/test/java/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.domainblacklist;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+public class TestDomainBlacklistURLFilter {
+ // Test data is looked up in the directory named by the "test.data" system property (default ".").
+ private final static String SEPARATOR = System.getProperty("file.separator");
+ private final static String SAMPLES = System.getProperty("test.data", ".");
+
+ @Test
+ public void testFilter() throws Exception {
+ // hosts.txt presumably lists net, apache.org, be, www.yahoo.com (confirm against the test resource); exactly the matching URLs must be rejected
+ String domainBlacklistFile = SAMPLES + SEPARATOR + "hosts.txt";
+ Configuration conf = NutchConfiguration.create();
+ DomainBlacklistURLFilter domainBlacklistFilter = new DomainBlacklistURLFilter(
+ domainBlacklistFile);
+ domainBlacklistFilter.setConf(conf);
+ Assert.assertNull(domainBlacklistFilter.filter("http://lucene.apache.org"));
+ Assert.assertNull(domainBlacklistFilter.filter("http://hadoop.apache.org"));
+ Assert.assertNull(domainBlacklistFilter.filter("http://www.apache.org"));
+ Assert.assertNotNull(domainBlacklistFilter.filter("http://www.google.com"));
+ Assert.assertNotNull(domainBlacklistFilter.filter("http://mail.yahoo.com"));
+ Assert.assertNull(domainBlacklistFilter.filter("http://www.foobar.net"));
+ Assert.assertNull(domainBlacklistFilter.filter("http://www.foobas.net"));
+ Assert.assertNull(domainBlacklistFilter.filter("http://www.yahoo.com"));
+ Assert.assertNull(domainBlacklistFilter.filter("http://www.foobar.be"));
+ Assert.assertNotNull(domainBlacklistFilter.filter("http://www.adobe.com"));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domainblacklist/src/test/resources/hosts.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domainblacklist/src/test/resources/hosts.txt b/nutch-plugins/urlfilter-domainblacklist/src/test/resources/hosts.txt
new file mode 100644
index 0000000..2b88c3b
--- /dev/null
+++ b/nutch-plugins/urlfilter-domainblacklist/src/test/resources/hosts.txt
@@ -0,0 +1,5 @@
+# comments start with the pound sign
+net
+apache.org
+be
+www.yahoo.com
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-ignoreexempt/README.md
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-ignoreexempt/README.md b/nutch-plugins/urlfilter-ignoreexempt/README.md
new file mode 100644
index 0000000..d48b672
--- /dev/null
+++ b/nutch-plugins/urlfilter-ignoreexempt/README.md
@@ -0,0 +1,43 @@
+urlfilter-ignoreexempt
+======================
+ This plugin allows certain urls to be exempted when the external links are configured to be ignored.
+ This is useful when a focused crawl is set up but some resources, such as static files, are linked from CDNs (external domains).
+
+# How to enable?
+Add `urlfilter-ignoreexempt` value to `plugin.includes` property
+```xml
+<property>
+ <name>plugin.includes</name>
+ <value>protocol-http|urlfilter-(regex|ignoreexempt)...</value>
+</property>
+```
+
+# How to configure rules?
+
+Open `conf/db-ignore-external-exemptions.txt` and add the regex rules.
+
+## Format :
+
+The format is the same as `regex-urlfilter.txt`.
+ Each non-comment, non-blank line contains a regular expression
+ prefixed by '+' or '-'. The first matching pattern in the file
+ determines whether a URL is exempted or ignored. If no pattern
+ matches, the URL is ignored.
+
+
+## Example :
+
+ To exempt urls ending with image extensions, use this rule
+
+`+(?i)\.(jpg|png|gif)$`
+
+
+
+## Testing the Rules :
+
+After enabling the plugin and adding your rules to `conf/db-ignore-external-exemptions.txt`, run:
+
+`bin/nutch plugin urlfilter-ignoreexempt org.apache.nutch.urlfilter.ignoreexempt.ExemptionUrlFilter http://yoururl.here`
+
+
+This should print `true` for urls which are accepted by configured rules.
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-ignoreexempt/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-ignoreexempt/build.xml b/nutch-plugins/urlfilter-ignoreexempt/build.xml
new file mode 100644
index 0000000..105f551
--- /dev/null
+++ b/nutch-plugins/urlfilter-ignoreexempt/build.xml
@@ -0,0 +1,55 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-ignoreexempt" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+ <!-- Build compilation dependencies -->
+ <target name="deps-jar">
+ <ant target="jar" inheritall="false" dir="../lib-regex-filter"/>
+ </target>
+
+ <!-- Add compilation dependencies to classpath -->
+ <path id="plugin.deps">
+ <fileset dir="${nutch.root}/build">
+ <include name="**/lib-regex-filter/*.jar" />
+ <include name="**/urlfilter-regex/*.jar" />
+ </fileset>
+ <pathelement location="${nutch.root}/build/lib-regex-filter/test"/>
+ <pathelement location="${nutch.root}/build/urlfilter-regex/test"/>
+ </path>
+
+ <!-- Compile test classes for dependencies -->
+ <target name="deps-test-compile">
+ <ant target="compile-test" inheritall="false" dir="../lib-regex-filter"/>
+ <ant target="compile-test" inheritall="false" dir="../urlfilter-regex"/>
+ </target>
+
+ <!-- Deploy Unit test dependencies -->
+ <target name="deps-test">
+ <ant target="deploy" inheritall="false" dir="../lib-regex-filter"/>
+ <ant target="deploy" inheritall="false" dir="../urlfilter-regex"/>
+ </target>
+
+ <!-- for junit test -->
+ <mkdir dir="${build.test}/data"/>
+ <copy todir="${build.test}/data">
+ <fileset dir="data" />
+ </copy>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-ignoreexempt/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-ignoreexempt/ivy.xml b/nutch-plugins/urlfilter-ignoreexempt/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/urlfilter-ignoreexempt/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../../ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-ignoreexempt/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-ignoreexempt/plugin.xml b/nutch-plugins/urlfilter-ignoreexempt/plugin.xml
new file mode 100644
index 0000000..4139ca4
--- /dev/null
+++ b/nutch-plugins/urlfilter-ignoreexempt/plugin.xml
@@ -0,0 +1,45 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="urlfilter-ignoreexempt"
+ name="External Domain Ignore Exemption"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="urlfilter-ignoreexempt.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ <import plugin="lib-regex-filter"/>
+ <import plugin="urlfilter-regex"/>
+ </requires>
+
+ <extension id="org.apache.nutch.net.urlfilter.ignoreexempt"
+ name="Ignore Exemption Url Filter"
+ point="org.apache.nutch.net.URLExemptionFilter">
+ <implementation id="ExemptionUrlFilter"
+ class="org.apache.nutch.urlfilter.ignoreexempt.ExemptionUrlFilter">
+ <parameter name="file" value="db-ignore-external-exemptions.txt"/>
+ </implementation>
+ </extension>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-ignoreexempt/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-ignoreexempt/pom.xml b/nutch-plugins/urlfilter-ignoreexempt/pom.xml
new file mode 100644
index 0000000..fd26587
--- /dev/null
+++ b/nutch-plugins/urlfilter-ignoreexempt/pom.xml
@@ -0,0 +1,45 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>urlfilter-ignoreexempt</artifactId>
+ <packaging>jar</packaging>
+
+ <name>urlfilter-ignoreexempt</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>urlfilter-regex</artifactId>
+ <version>${project.parent.version}</version>
+ </dependency>
+ </dependencies>
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java b/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
new file mode 100644
index 0000000..bbac300
--- /dev/null
+++ b/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.ignoreexempt;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLExemptionFilter;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.urlfilter.regex.RegexURLFilter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.regex.Pattern;
+import java.util.List;
+import java.util.ArrayList;
+
+
+/**
+ * This implementation of {@link org.apache.nutch.net.URLExemptionFilter} uses regex configuration
+ * to check whether a URL is eligible for exemption from 'db.ignore.external'.
+ * When this filter is enabled, external URLs are checked against the configured sequence of regex rules.
+ *<p>
+ * The exemption rule file defaults to db-ignore-external-exemptions.txt in the classpath but can be
+ * overridden using the property <code>"db.ignore.external.exemptions.file"</code> in <code>./conf/nutch-*.xml</code>
+ *</p>
+ *
+ * The exemption rules are specified in a plain text file where each line is a rule.
+ * The format is the same as `regex-urlfilter.txt`.
+ * Each non-comment, non-blank line contains a regular expression
+ * prefixed by '+' or '-'. The first matching pattern in the file
+ * determines whether a URL is exempted or ignored. If no pattern
+ * matches, the URL is ignored.
+ *
+ * @since Feb 10, 2016
+ * @version 1
+ * @see org.apache.nutch.net.URLExemptionFilter
+ * @see org.apache.nutch.urlfilter.regex.RegexURLFilter
+ */
+public class ExemptionUrlFilter extends RegexURLFilter
+ implements URLExemptionFilter {
+
+ public static final String DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE
+ = "db.ignore.external.exemptions.file";
+ private static final Logger LOG =
+ LoggerFactory.getLogger(ExemptionUrlFilter.class);
+
+ private List<Pattern> exemptions;
+ private Configuration conf;
+
+ public List<Pattern> getExemptions() {
+ return exemptions;
+ }
+
+ @Override
+ public boolean filter(String fromUrl, String toUrl) {
+ //this implementation does not consider fromUrl param.
+ //the regex rules are applied to toUrl.
+ return this.filter(toUrl) != null;
+ }
+
+ /**
+ * Gets reader for regex rules
+ */
+ protected Reader getRulesReader(Configuration conf)
+ throws IOException {
+ String fileRules = conf.get(DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE);
+ return conf.getConfResourceAsReader(fileRules);
+ }
+
+ public static void main(String[] args) {
+
+ if (args.length != 1) {
+ System.out.println("Error: Invalid Args");
+ System.out.println("Usage: " +
+ ExemptionUrlFilter.class.getName() + " <url>");
+ return;
+ }
+ String url = args[0];
+ ExemptionUrlFilter instance = new ExemptionUrlFilter();
+ instance.setConf(NutchConfiguration.create());
+ System.out.println(instance.filter(null, url));
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java b/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java
new file mode 100644
index 0000000..ee949c5
--- /dev/null
+++ b/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL filter plugin which identifies exemptions to external urls
+ * when external urls are set to ignore.
+ *
+ */
+package org.apache.nutch.urlfilter.ignoreexempt;
+
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-prefix/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-prefix/build.xml b/nutch-plugins/urlfilter-prefix/build.xml
new file mode 100644
index 0000000..33faa48
--- /dev/null
+++ b/nutch-plugins/urlfilter-prefix/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-prefix" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-prefix/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-prefix/ivy.xml b/nutch-plugins/urlfilter-prefix/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/urlfilter-prefix/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-prefix/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-prefix/plugin.xml b/nutch-plugins/urlfilter-prefix/plugin.xml
new file mode 100644
index 0000000..22cfcaf
--- /dev/null
+++ b/nutch-plugins/urlfilter-prefix/plugin.xml
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="urlfilter-prefix"
+ name="Prefix URL Filter"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="urlfilter-prefix.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.net.urlfilter.prefix"
+ name="Nutch Prefix URL Filter"
+ point="org.apache.nutch.net.URLFilter">
+ <implementation id="PrefixURLFilter"
+ class="org.apache.nutch.urlfilter.prefix.PrefixURLFilter"/>
+ <!-- by default, attribute "file" is undefined, to keep classic behavior.
+ <implementation id="PrefixURLFilter"
+ class="org.apache.nutch.urlfilter.prefix.PrefixURLFilter">
+ <parameter name="file" value="urlfilter-prefix.txt"/>
+ </implementation>
+ -->
+ </extension>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-prefix/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-prefix/pom.xml b/nutch-plugins/urlfilter-prefix/pom.xml
new file mode 100644
index 0000000..65ad019
--- /dev/null
+++ b/nutch-plugins/urlfilter-prefix/pom.xml
@@ -0,0 +1,38 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>urlfilter-prefix</artifactId>
+ <packaging>jar</packaging>
+
+ <name>urlfilter-prefix</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java b/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
new file mode 100644
index 0000000..2e955b5
--- /dev/null
+++ b/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
@@ -0,0 +1,178 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.prefix;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.*;
+
+import org.apache.nutch.util.PrefixStringMatcher;
+import org.apache.nutch.util.TrieStringMatcher;
+
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+
+import java.io.Reader;
+import java.io.FileReader;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.IOException;
+import java.io.StringReader;
+
+import java.util.List;
+import java.util.ArrayList;
+
+/**
+ * Filters URLs based on a file of URL prefixes. The file is named by (1)
+ * property "urlfilter.prefix.file" in ./conf/nutch-default.xml, and (2)
+ * attribute "file" in plugin.xml of this plugin. Attribute "file" has higher
+ * precedence if defined.
+ *
+ * <p>
+ * The format of this file is one URL prefix per line.
+ * </p>
+ */
+public class PrefixURLFilter implements URLFilter {
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(PrefixURLFilter.class);
+
+ // read in attribute "file" of this plugin.
+ private static String attributeFile = null;
+
+ private TrieStringMatcher trie;
+
+ private Configuration conf;
+
+ public PrefixURLFilter() throws IOException {
+
+ }
+
+ public PrefixURLFilter(String stringRules) throws IOException {
+ trie = readConfiguration(new StringReader(stringRules));
+ }
+
+ public String filter(String url) {
+ if (trie.shortestMatch(url) == null)
+ return null;
+ else
+ return url;
+ }
+
+ private TrieStringMatcher readConfiguration(Reader reader) throws IOException {
+
+ BufferedReader in = new BufferedReader(reader);
+ List<String> urlprefixes = new ArrayList<String>();
+ String line;
+
+ while ((line = in.readLine()) != null) {
+ if (line.length() == 0)
+ continue;
+
+ char first = line.charAt(0);
+ switch (first) {
+ case ' ':
+ case '\n':
+ case '#': // skip blank & comment lines
+ continue;
+ default:
+ urlprefixes.add(line);
+ }
+ }
+
+ return new PrefixStringMatcher(urlprefixes);
+ }
+
+ public static void main(String args[]) throws IOException {
+
+ PrefixURLFilter filter;
+ if (args.length >= 1)
+ filter = new PrefixURLFilter(args[0]);
+ else
+ filter = new PrefixURLFilter();
+
+ BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+ String line;
+ while ((line = in.readLine()) != null) {
+ String out = filter.filter(line);
+ if (out != null) {
+ System.out.println(out);
+ }
+ }
+ }
+
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+
+ String pluginName = "urlfilter-prefix";
+ Extension[] extensions = PluginRepository.get(conf)
+ .getExtensionPoint(URLFilter.class.getName()).getExtensions();
+ for (int i = 0; i < extensions.length; i++) {
+ Extension extension = extensions[i];
+ if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+ attributeFile = extension.getAttribute("file");
+ break;
+ }
+ }
+ if (attributeFile != null && attributeFile.trim().equals(""))
+ attributeFile = null;
+ if (attributeFile != null) {
+ if (LOG.isInfoEnabled()) {
+ LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+ + " as " + attributeFile);
+ }
+ } else {
+ // if (LOG.isWarnEnabled()) {
+ // LOG.warn("Attribute \"file\" is not defined in plugin.xml for
+ // plugin "+pluginName);
+ // }
+ }
+
+ String file = conf.get("urlfilter.prefix.file");
+ String stringRules = conf.get("urlfilter.prefix.rules");
+ // attribute "file" takes precedence if defined
+ if (attributeFile != null)
+ file = attributeFile;
+ Reader reader = null;
+ if (stringRules != null) { // takes precedence over files
+ reader = new StringReader(stringRules);
+ } else {
+ reader = conf.getConfResourceAsReader(file);
+ }
+
+ if (reader == null) {
+ trie = new PrefixStringMatcher(new String[0]);
+ } else {
+ try {
+ trie = readConfiguration(reader);
+ } catch (IOException e) {
+ if (LOG.isErrorEnabled()) {
+ LOG.error(e.getMessage());
+ }
+ // TODO mb@media-style.com: throw Exception? Because broken api.
+ throw new RuntimeException(e.getMessage(), e);
+ }
+ }
+ }
+
+ public Configuration getConf() {
+ return this.conf;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/package.html b/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/package.html
new file mode 100644
index 0000000..dbed0be
--- /dev/null
+++ b/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>URL filter plugin to include only URLs which match one of a given list of URL prefixes.</p>
+</body>
+</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-prefix/src/test/java/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-prefix/src/test/java/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java b/nutch-plugins/urlfilter-prefix/src/test/java/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java
new file mode 100644
index 0000000..b7a7ce4
--- /dev/null
+++ b/nutch-plugins/urlfilter-prefix/src/test/java/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.prefix;
+
+import junit.framework.Test;
+import junit.framework.TestCase;
+import junit.framework.TestSuite;
+import junit.textui.TestRunner;
+
+import java.io.IOException;
+
+
+/**
+ * JUnit test for <code>PrefixURLFilter</code>.
+ *
+ * @author Talat Uyarer
+ * @author Cihad Guzel
+ */
+public class TestPrefixURLFilter extends TestCase {
+ private static final String prefixes =
+ "# this is a comment\n" +
+ "\n" +
+ "http://\n" +
+ "https://\n" +
+ "file://\n" +
+ "ftp://\n";
+
+ private static final String[] urls = new String[] {
+ "http://www.example.com/",
+ "https://www.example.com/",
+ "ftp://www.example.com/",
+ "file://www.example.com/",
+ "abcd://www.example.com/",
+ "www.example.com/",
+ };
+
+ private static String[] urlsModeAccept = new String[] {
+ urls[0],
+ urls[1],
+ urls[2],
+ urls[3],
+ null,
+ null
+ };
+
+ private PrefixURLFilter filter = null;
+
+ public static Test suite() {
+ return new TestSuite(TestPrefixURLFilter.class);
+ }
+
+ public static void main(String[] args) {
+ TestRunner.run(suite());
+ }
+
+ public void setUp() throws IOException {
+ filter = new PrefixURLFilter(prefixes);
+ }
+
+ public void testModeAccept() {
+ for (int i = 0; i < urls.length; i++) {
+ assertTrue(urlsModeAccept[i] == filter.filter(urls[i]));
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/build.xml b/nutch-plugins/urlfilter-regex/build.xml
new file mode 100644
index 0000000..5b80d08
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/build.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-regex" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+ <!-- Build compilation dependencies -->
+ <target name="deps-jar">
+ <ant target="jar" inheritall="false" dir="../lib-regex-filter"/>
+ </target>
+
+ <!-- Add compilation dependencies to classpath -->
+ <path id="plugin.deps">
+ <fileset dir="${nutch.root}/build">
+ <include name="**/lib-regex-filter/*.jar" />
+ </fileset>
+ <pathelement location="${nutch.root}/build/lib-regex-filter/test"/>
+ </path>
+
+ <!-- Compile test classes for dependencies -->
+ <target name="deps-test-compile">
+ <ant target="compile-test" inheritall="false" dir="../lib-regex-filter"/>
+ </target>
+
+ <!-- Deploy Unit test dependencies -->
+ <target name="deps-test">
+ <ant target="deploy" inheritall="false" dir="../lib-regex-filter"/>
+ </target>
+
+ <!-- for junit test -->
+ <mkdir dir="${build.test}/data"/>
+ <copy todir="${build.test}/data">
+ <fileset dir="sample" includes="**/*.rules, **/*.urls"/>
+ </copy>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/ivy.xml b/nutch-plugins/urlfilter-regex/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/plugin.xml b/nutch-plugins/urlfilter-regex/plugin.xml
new file mode 100644
index 0000000..34f4a91
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/plugin.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="urlfilter-regex"
+ name="Regex URL Filter"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="urlfilter-regex.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ <import plugin="lib-regex-filter"/>
+ </requires>
+
+ <extension id="org.apache.nutch.net.urlfilter.regex"
+ name="Nutch Regex URL Filter"
+ point="org.apache.nutch.net.URLFilter">
+ <implementation id="RegexURLFilter"
+ class="org.apache.nutch.urlfilter.regex.RegexURLFilter"/>
+ <!-- by default, attribute "file" is undefined, to keep classic behavior.
+ <implementation id="RegexURLFilter"
+ class="org.apache.nutch.urlfilter.regex.RegexURLFilter">
+ <parameter name="file" value="urlfilter-regex.txt"/>
+ </implementation>
+ -->
+ </extension>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/pom.xml b/nutch-plugins/urlfilter-regex/pom.xml
new file mode 100644
index 0000000..db9e7bd
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/pom.xml
@@ -0,0 +1,53 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>urlfilter-regex</artifactId>
+ <packaging>jar</packaging>
+
+ <name>urlfilter-regex</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>lib-regex-filter</artifactId>
+ <version>${project.parent.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>lib-regex-filter</artifactId>
+ <version>${project.parent.version}</version>
+ <scope>test</scope>
+ <type>test-jar</type>
+ </dependency>
+ </dependencies>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java b/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
new file mode 100644
index 0000000..2988114
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
@@ -0,0 +1,111 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.regex;
+
+// JDK imports
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.urlfilter.api.RegexRule;
+import org.apache.nutch.urlfilter.api.RegexURLFilterBase;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * Filters URLs based on a file of regular expressions using the
+ * {@link java.util.regex Java Regex implementation}.
+ *
+ * <p>
+ * Rules come either from the inline {@link #URLFILTER_REGEX_RULES} property or
+ * from the resource named by {@link #URLFILTER_REGEX_FILE}; the inline
+ * property takes precedence (see {@link #getRulesReader(Configuration)}).
+ * </p>
+ */
+public class RegexURLFilter extends RegexURLFilterBase {
+
+  /** Configuration property naming the rules resource to load. */
+  public static final String URLFILTER_REGEX_FILE = "urlfilter.regex.file";
+
+  /** Configuration property holding the rules themselves as a string. */
+  public static final String URLFILTER_REGEX_RULES = "urlfilter.regex.rules";
+
+  /**
+   * Creates a filter through the no-argument base constructor. As
+   * {@link #main(String[])} does, callers presumably supply the rules
+   * afterwards via <code>setConf</code> (confirm against the base class).
+   */
+  public RegexURLFilter() {
+    super();
+  }
+
+  /**
+   * Creates a filter by handing <code>filename</code> to the base class.
+   *
+   * @param filename
+   *          passed through unchanged to the base constructor
+   * @throws IOException
+   *           propagated from the base constructor
+   * @throws PatternSyntaxException
+   *           if a rule's regular expression cannot be compiled
+   */
+  public RegexURLFilter(String filename) throws IOException,
+      PatternSyntaxException {
+    super(filename);
+  }
+
+  /**
+   * Creates a filter by handing <code>reader</code> to the base class.
+   * Package-private; used by the tests in this package.
+   *
+   * @param reader
+   *          passed through unchanged to the base constructor
+   * @throws IOException
+   *           propagated from the base constructor
+   * @throws IllegalArgumentException
+   *           propagated from the base constructor
+   */
+  RegexURLFilter(Reader reader) throws IOException, IllegalArgumentException {
+    super(reader);
+  }
+
+  /* ---------------- implementation of RegexURLFilterBase ---------------- */
+
+  /**
+   * Returns a reader over the filter rules described by the configuration.
+   * Rules specified as a config property will override rules specified as a
+   * config file.
+   *
+   * @param conf
+   *          configuration both properties are read from
+   * @return a reader over the inline rules if {@link #URLFILTER_REGEX_RULES}
+   *         is set, otherwise whatever
+   *         <code>conf.getConfResourceAsReader</code> returns for the resource
+   *         named by {@link #URLFILTER_REGEX_FILE}
+   * @throws IOException
+   *           if neither property is set
+   */
+  @Override
+  protected Reader getRulesReader(Configuration conf) throws IOException {
+    String stringRules = conf.get(URLFILTER_REGEX_RULES);
+    if (stringRules != null) {
+      return new StringReader(stringRules);
+    }
+    String fileRules = conf.get(URLFILTER_REGEX_FILE);
+    if (fileRules == null) {
+      // Report the misconfiguration by name rather than passing a null
+      // resource name on to the configuration.
+      throw new IOException("Neither " + URLFILTER_REGEX_RULES + " nor "
+          + URLFILTER_REGEX_FILE + " is set; no regex URL filter rules to load");
+    }
+    return conf.getConfResourceAsReader(fileRules);
+  }
+
+  // Inherited Javadoc
+  @Override
+  protected RegexRule createRule(boolean sign, String regex) {
+    return new Rule(sign, regex);
+  }
+
+  // Inherited Javadoc
+  @Override
+  protected RegexRule createRule(boolean sign, String regex, String hostOrDomain) {
+    return new Rule(sign, regex, hostOrDomain);
+  }
+
+  /* ------------- end of implementation of RegexURLFilterBase ------------- */
+
+  /**
+   * Command-line entry point: builds a filter, configures it with
+   * <code>NutchConfiguration.create()</code> and delegates to the inherited
+   * <code>main(filter, args)</code>.
+   *
+   * @param args
+   *          command-line arguments, forwarded unchanged
+   * @throws IOException
+   *           propagated from the inherited <code>main</code>
+   */
+  public static void main(String[] args) throws IOException {
+    RegexURLFilter filter = new RegexURLFilter();
+    filter.setConf(NutchConfiguration.create());
+    main(filter, args);
+  }
+
+  /**
+   * A rule backed by a compiled {@link Pattern}. Declared static because it
+   * never touches the enclosing filter, so it need not keep a reference to it.
+   */
+  private static final class Rule extends RegexRule {
+
+    /** The rule's regular expression, compiled once at construction. */
+    private final Pattern pattern;
+
+    /** Creates a rule with no host or domain (passes <code>null</code>). */
+    Rule(boolean sign, String regex) {
+      this(sign, regex, null);
+    }
+
+    /**
+     * @throws PatternSyntaxException
+     *           if <code>regex</code> is not a valid Java regular expression
+     */
+    Rule(boolean sign, String regex, String hostOrDomain) {
+      super(sign, regex, hostOrDomain);
+      pattern = Pattern.compile(regex);
+    }
+
+    /**
+     * Tests the URL with <code>Matcher.find()</code>, so an unanchored regex
+     * may match anywhere inside the URL.
+     */
+    @Override
+    protected boolean match(String url) {
+      return pattern.matcher(url).find();
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/package.html b/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/package.html
new file mode 100644
index 0000000..7acf73b
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>URL filter plugin to include and/or exclude URLs matching Java regular expressions.</p>
+</body>
+</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/src/test/java/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/src/test/java/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java b/nutch-plugins/urlfilter-regex/src/test/java/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
new file mode 100644
index 0000000..b86181e
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/test/java/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.regex;
+
+// JDK imports
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.nutch.net.*;
+// Nutch imports
+import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * JUnit based test of class <code>RegexURLFilter</code>.
+ *
+ * @author Jérôme Charron
+ */
+public class TestRegexURLFilter extends RegexURLFilterBaseTest {
+
+ protected URLFilter getURLFilter(Reader rules) {
+ try {
+ return new RegexURLFilter(rules);
+ } catch (IOException e) {
+ Assert.fail(e.toString());
+ return null;
+ }
+ }
+
+ @Test
+ public void test() {
+ test("WholeWebCrawling");
+ test("IntranetCrawling");
+ bench(50, "Benchmarks");
+ bench(100, "Benchmarks");
+ bench(200, "Benchmarks");
+ bench(400, "Benchmarks");
+ bench(800, "Benchmarks");
+ }
+
+ @Test
+ public void test1838() {
+ test("nutch1838");
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.rules
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.rules b/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.rules
new file mode 100644
index 0000000..c8901e2
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.rules
@@ -0,0 +1,26 @@
+# The url filter file used by the crawl command.
+
+# Better for intranet crawling.
+# Be sure to change MY.DOMAIN.NAME to your domain name.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'. The first matching pattern in the file
+# determines whether a URL is included or ignored. If no pattern
+# matches, the URL is ignored.
+
+# skip file:, ftp:, & mailto: urls
+-^(file|ftp|mailto):
+
+# skip image and other suffixes we can't yet parse
+-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$
+
+# skip URLs containing certain characters as probable queries, etc.
+-[?*!@=]
+
+# skip .fr .org and .net domains
+-^.*//.*\.fr/
+-^.*//.*\.org/
+-^.*//.*\.net/
+
+# accept everything else
++.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.urls
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.urls b/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.urls
new file mode 100644
index 0000000..40bf4ee
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.urls
@@ -0,0 +1,297 @@
++http://www.hostip.info/
+-http://www.elanceur.org/Articles/OntologieSurfaite.html
++http://www.opensymphony.com/quartz/
+-http://www.portletbridge.org/saxbenchmark/index.html
++http://www.lesmotsdelinfo.com/
++http://usefulinc.com/doap/
++http://www.codezoo.com/
++http://search.infocious.com/
+-http://pedagogie.ac-montpellier.fr/disciplines/anglais/tice/sms.html
++http://www.brics.dk/%7Eamoeller/automaton/
++http://jazzz.com/wp.html
++http://www.maxkiesler.com/index.php
++http://adscriptum.blogspot.com/2006/03/google-et-la-prsentation-deric-schmidt.html
++http://www.alias-i.com/lingpipe/
+-http://johnny.ihackstuff.com/index.php?module=prodreviews
+-http://www.spurl.net/
++http://www.dropload.com/
++http://vivisimo.com/
++http://www.marumushi.com/apps/newsmap/newsmap.cfm
++http://www.ixquick.com/
+-http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html
++http://www.mail-archive.com/
++http://www.spymac.com/
+-http://browsers.evolt.org/
+-http://www.oswd.org/
++http://www.stayinvisible.com/index.pl
++http://java.sun.com/j2se/1.4.2/docs/api/index.html
++http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/ntcmds.mspx
++http://www.bloglines.com/
+-http://www.fckeditor.net/
++http://search.msn.com/
+-http://www.grub.org/
++http://www.xml.com/pub/a/2000/11/29/schemas/part1.html
+-http://www.mnot.net/cache_docs/
+-http://www.furl.net/
++http://www.blogpulse.com/
++http://www.googlefight.com/
++http://www.rokulabs.com/
+-http://mightylegends.zapto.org/dvd/dvdauthor_howto.php
+-http://www.batbox.org/wrt54g-linux.html
+-http://en.wikipedia.org/wiki/%s
++http://www.sipcenter.com/
++http://www.merriampark.com/ld.htm
++http://anon.inf.tu-dresden.de/index_en.html
++http://www.pluck.com/
++http://www.tiddlywiki.com/
++http://www.jux2.com/
++http://clusty.com/
+-http://findability.org/
++http://www.searchengineshowdown.com/
++http://www.nhacks.com/email/index.php
++http://www.koders.com/
++http://www.cs.rochester.edu/sosp2003/papers/p125-ghemawat.pdf
++http://www.gmailwiki.com/index.php/Main_Page
++http://www.tadalist.com/
++http://www.net2ftp.com/
++http://www.streamload.com/
++http://www.lucazappa.com/brilliantMaker/buttonImage.php
++http://www.hybernaut.com/bdv/delicious-import.html
++http://www.gtmcknight.com/buttons/
++http://amb.vis.ne.jp/mozilla/scrapbook/
++http://g-metrics.com/index.php
+-http://tor.eff.org/
++http://www.search-this.com/search_engine_decoder.asp
++http://www.onjava.com/pub/a/onjava/2005/01/26/classloading.html
++http://www.adaptivepath.com/publications/essays/archives/000385.php
+-http://isnoop.net/gmail/
+-http://openweb.eu.org/
++http://www.mistergooddeal.com/
++http://javatoolbox.com/
+-http://www.freenews.fr/
++http://www.wikiwax.com/
+-http://today.java.net/pub/a/today/2005/04/21/farm.html
++http://users.skynet.be/J.Beever/pave.htm
++http://www.lundi8h.com/
++http://www.snap.com/
++http://www.goosee.com/puppy/index.shtml
+-http://www.softwarefreedom.org/index.html
+-http://y.20q.net/
++http://www.bitty.com/
++http://www.lafraise.com/
+-http://www.liquidinformation.org/
++http://www.searchtools.com/
++http://www.martinfowler.com/articles/injection.html
++http://pdos.csail.mit.edu/scigen/
+-http://developer.yahoo.net/blog/
++http://blogger-templates.blogspot.com/
++http://phpadsnew.com/two/
++http://www.langreiter.com/exec/yahoo-vs-google.html
+-http://www.dataparksearch.org/
+-http://www.yubnub.org/
+-http://www.fing.org/
+-http://www.swish-e.org/
+-http://www.openajax.net/wordpress/
++http://crypto.stanford.edu/PwdHash/
++http://www.html-kit.com/favicon/
+-http://today.java.net/pub/a/today/2005/08/09/didyoumean.html?page=1
++http://www.durhamtownship.com/
++http://jiwire.com/
++http://www.insilmaril.de/vym/
+-http://www.spreadshirt.net/
++http://www.goffice.com/
++http://www.writely.com/
++http://www.milindparikh.com/
++http://www.onjava.com/pub/a/onjava/2005/02/02/bitsets.html
++http://www.wikyblog.com/Map/Guest/Home
+-http://www.kottke.org/05/08/googleos-webos
++http://www.rollyo.com/
++http://www.meebo.com/
++http://www.factbites.com/
++http://www.placeopedia.com/
++http://swoogle.umbc.edu/
++http://www.viaduc.com/
+-http://demo.wikiwyg.net/wikiwyg/demo/standalone/
++http://podcasts.yahoo.com/
+-http://beaglewiki.org/Main_Page
++http://yq.search.yahoo.com/
+-http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html?page=1
++http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html
++http://socialight.com/
++http://www.lexxe.com/
++http://www.xom.nu/
++http://www.turboprint.de/
++http://www.whatdoesthatmean.com/index.php/Welcome_to_%27Whatdoesthatmean%3F%27
++http://www.wi-fiplanet.com/tutorials/article.php/3562391
++http://particletree.com/features/10-tips-to-a-better-form/
++http://www.songbirdnest.com/
+-http://www.w3.org/Talks/Tools/Slidy/
+-http://www.compassframework.org/display/SITE/Home
++http://motrech.blogspot.com/
++http://www.moteurzine.com/
++http://www.mex-search.com/
+-http://beta.previewseek.com/?mdc=y&twin=n&ilang=french
++http://www.goshme.com/
++http://rialto.application-servers.com/
++http://www.multe-pass.com/
++http://www.tailrank.com/
++http://www.vandertramp.com/INTERNETDOWN/
++http://www.letterjames.de/index.html
++http://code.google.com/index.html
++http://www.kritx.com/
++http://performancing.com/firefox
++http://www.mywebsearch.com/
+-http://en.wikibooks.org/w/index.php?title=Wikimania05/IM1
++http://www.lukew.com/resources/articles/blogs2.asp
+-http://www.hyperwords.net/
++http://ajax.parish.ath.cx/translator/
++http://www.maplandia.com/
+-http://www.tbray.org/ongoing/When/200x/2006/01/08/No-New-XML-Languages
++http://onefeed.com/index.php
++http://www.file-swap.com/
+-http://opennlp.org/
++http://mindprod.com/jgloss/encoding.html
++http://code.google.com/webstats/index.html
++http://www.freeweb-hosting.com/google_pagerank_pr_checker/
+-http://www.framakey.org/
+-http://microformats.org/wiki/hreview
+-http://www.ashesandsnow.org/index2.html
+-http://uima-framework.sourceforge.net/
++http://sethgodin.typepad.com/seths_blog/2006/01/flipping_the_fu.html
+-http://www.anandtech.com/IT/showdoc.aspx?i=2523&p=2
++http://fr.techcrunch.com/
+-http://developer.yahoo.net/yui/
++http://www.fredrikodman.com/
++http://www.mpirical.com/companion/mpirical_companion.html
++http://www.onjava.com/pub/a/onjava/2005/08/03/drools.html
+-http://k9copy.free.fr/
+-http://lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3
+-http://www.tbray.org/ongoing/When/200x/2006/01/09/On-XML-Language-Design
+-http://lespetitescases.net/structurer-decrire-et-organiser-l-information-2
++http://blogokat.canalblog.com/archives/2005/11/02/882454.html
++http://robur.slu.se/jensl/xmlclitools/
+-http://www.internetactu.net/?p=6291
+-http://www.xml.com/pub/a/2005/10/19/microformats-and-web-2.0.html?page=1
++http://www.memodata.com/2004/fr/alexandria/
+-http://presse-citron.net/?2006/01/23/654-joomla-pete-grave
++http://www.randomerror.com/
++http://www.i-cherubini.it/mauro/blog/2006/01/05/techniques-for-determining-the-location-on-umts-networks/
+-http://fr.newsgator.com/ngs/subscriber/WebEd2.aspx?fid=368395
+-http://interstices.info/display.jsp?id=c_15918
++http://www.tech-invite.com/
++http://www.croczilla.com/zap
+-http://www.libervis.com/modules/wordpress/?p=13
++http://www.searchmorph.com/wp/2005/07/19/recent-discovery-clickfraud-tools/
+-http://savoirscdi.cndp.fr/CulturePro/actualisation/Serres/Serres.htm
++http://www.influo.com/
++http://www.dsi-info.ca/chroniques/chroniques-recherche-web.html
+-http://www.addnb.org/fr/docs/webinvisible.htm
+-http://manhack.net/
+-http://www.jibaku.net/
++http://www.pipologie.com/
++http://christophenoel.blogspot.com/
+-http://www.seekport.fr/seekbot/
++http://beta.exalead.com/
+-http://www.boolgum.fr/index.html
++http://www.kesako.canalblog.com/
++http://loran.blogspot.com/
++http://outils-recherche.blogspot.com/
++http://www.art-dept.com/artists/giacobbe/
++http://www.meggould.netfirms.com/site_seeingIII.htm
++http://www.freedpi.com/
++http://www.frenchfred.com/
++http://www.photoways.com/
+-http://freco.free.fr/index.htm
+-http://triturages.free.fr/index.htm
+-http://www.qsos.org/
++http://www.alvis.info/alvis/
++http://www.i-cherubini.it/mauro/blog/2005/12/16/open-source-information-retrieval-systems/
+-http://www.shinux.org/
++http://www.linuxlinks.com/Distributions/Mini_Distributions/index.shtml
++http://www.kurobox.com/online/tiki-index.php
+-http://news.gmane.org/gmane.comp.misc.linkstation.linux
++http://www.imsbook.com/SIP-IMS-Standards-List.html
+-http://incubator.apache.org/directory/subprojects/snickers/
+-http://www.mozilla.org/projects/security/pki/jss/javadoc/org/mozilla/jss/asn1/package-summary.html
+-http://sourceforge.net/projects/cryptix-asn1/
+-http://sourceforge.net/projects/basn/
+-http://asn1.elibel.tm.fr/fr/index.htm
+-http://sourceforge.net/projects/a2j/
++http://www.degrouptest.com/
++http://interstices.info/
++http://louvre-boite.viabloga.com/news/18.shtml
+-http://tel.ccsd.cnrs.fr/documents/archives0/00/00/62/60/index_fr.html
++http://poiplace.oabsoftware.nl/
+-http://www.gpspassion.com/forumsen/topic.asp?TOPIC_ID=7759
+-http://www.yoono.com/favorites.jsp?user-id=lquerel
+-http://www.librecours.org/cgi-bin/main
+-http://www.onjava.com/pub/a/onjava/2006/01/18/using-lucene-to-search-java-source.html?page=1
+-http://limo.sourceforge.net/
++http://www-scf.usc.edu/%7Emattmann/
++http://spaces.msn.com/members/famillezen/
+-http://photos.joune.org/
+-http://www.canon.fr/paperart/
++http://flash.eastweb.ru/files/20051024092150.swf
++http://www.xsltwiki.com/index.php/Main_Page
++http://www.i-cherubini.it/mauro/blog/2005/12/08/software-that-goes-on-a-stick/
+-http://www.webrankinfo.com/forums/forum_15.htm?sid=307384cdbce813aa19ba017513cbbc31
++http://www.loiclemeur.com/france/2006/01/eric_tenin_se_f.html
+-http://member.openmobilealliance.org/ftp/Public_documents/MCC/2005/
++http://www.aeliosfinance.com/
++http://www.capital-it.com/
+-http://www.tradedoubler.fr/pan/public/solutions/publisher
+-http://www.recherche.gouv.fr/technologie/concours/2006/index.htm
++http://www.techcrunch.com/2005/12/21/gravee-takes-a-new-approach-to-search/
++http://wanabo.com/
+-http://www.lespetitescases.net/structurer-decrire-et-organiser-l-information-1
+-http://presse-citron.net/?2006/02/07/705-joue-la-comme-stickam
++http://aeliosfinance.com/
++http://www.centreincubation.com/
++http://www.franceincubation.com/
+-http://www.oseo.fr/
++http://www.i18nfaq.com/chardet.html
+-http://cpdetector.sourceforge.net/
++http://www.jeremi.info/index.php/2005/07/21/7-introduction-aux-methodes-agiles
++http://chezlorry.ca/Accueil.htm
++http://cetnia.blogs.com/d_lires/
+-http://www.directwine.fr/
++http://www.new-phenix.com/
+-http://upnp.sourceforge.net/
+-http://www.pixmania.fr/
+-http://www.lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3
++http://www.i-cherubini.it/mauro/blog/2006/01/25/kwmap-a-keyword-search-visualization-tool/
++http://www.stepnewz.com/sn/default.asp
++http://opquast.com/
+-http://www.freeplayer.org/
+-http://www.cafe-clope.net/orangeamere/index.php/2005/08/24/5-le-modele-contributif-une-utopie
+-http://atomcomputer.free.fr/fbox/
+-http://www.internetactu.net/index.php?p=6100
+-http://mammouthland.free.fr/cours/css/genecss.php
+-http://www.xml.com/pub/a/2006/02/01/doing-http-caching-right-introducing-httplib2.html?page=1
++http://www-106.ibm.com/developerworks/xml/library/x-xapi.html
+-http://xml.apache.org/xalan-j/extensions.html
++http://developers.sun.com/foryourbusiness/jcc/
++http://blogs.sun.com/roller/page/roumen/Weblog
+-http://www.onjava.com/pub/a/onjava/2005/10/12/diagnostic-tests-with-ant.html?page=1
+-http://blog.developpez.com/index.php?blog=51&p=1389&more=1&c=1&tb=1&pb=1
++http://dcabasson.developpez.com/articles/javascript/ajax/ajax-autocompletion-pas-a-pas/
++http://odur.let.rug.nl/%7Evannoord/
+-http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
+-http://artist.inist.fr/
++http://www.elra.info/
+-http://beinecke.library.yale.edu/dl_crosscollex/SearchExecXC.asp?srchtype=CNO
++http://www.i-cherubini.it/mauro/blog/2005/12/13/information-retrieval-system-evaluation-effort-sensitivity-and-reliability
++http://www.i-cherubini.it/mauro/blog/2005/12/13/trec-datasets-text-retrieval-conference-datasets-for-information-retrieval
++http://www.i-cherubini.it/mauro/blog/2005/12/12/focused-crawling-using-context-graphs/
++http://www.i-cherubini.it/mauro/blog/2005/12/08/spam-filtering-using-contextual-network-graphs/
++http://www.cs.northwestern.edu/%7Evidya/semanticons/IconsWebPage/
++http://www.i-cherubini.it/mauro/blog/2006/01/05/social-information-retrieval/
++http://www.i-cherubini.it/mauro/blog/2006/01/04/an-introduction-to-random-indexing/
++http://dossierdoc.typepad.com/descripteurs/2006/01/liste_de_thsaur.html
+-http://www.lexique.org/
++http://www.i-cherubini.it/mauro/blog/2006/01/22/montylingua-a-commonsense-enriched-part-of-speech-tagger/
++http://www.streamium.com/products/mx6000i/
+-http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&scy=FR&slg=fr
+-http://store.interact-tv.com/store/product_info.php?cPath=9&products_id=73
++http://www.tversity.com/
+-http://www.aspseek.org/index.php
\ No newline at end of file