You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:49:10 UTC

[26/69] [abbrv] [partial] nutch git commit: Re arranged the source code as per maven conventions for build

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java b/nutch-plugins/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
new file mode 100644
index 0000000..0be1e31
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.domain;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestDomainURLFilter {
+
+  private final static String SEPARATOR = System.getProperty("file.separator");
+  private final static String SAMPLES = System.getProperty("test.data", ".");
+
+  @Test
+  public void testFilter() throws Exception {
+
+    String domainFile = SAMPLES + SEPARATOR + "hosts.txt";
+    Configuration conf = NutchConfiguration.create();
+    DomainURLFilter domainFilter = new DomainURLFilter(domainFile);
+    domainFilter.setConf(conf);
+    Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org"));
+    Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org"));
+    Assert.assertNotNull(domainFilter.filter("http://www.apache.org"));
+    Assert.assertNull(domainFilter.filter("http://www.google.com"));
+    Assert.assertNull(domainFilter.filter("http://mail.yahoo.com"));
+    Assert.assertNotNull(domainFilter.filter("http://www.foobar.net"));
+    Assert.assertNotNull(domainFilter.filter("http://www.foobas.net"));
+    Assert.assertNotNull(domainFilter.filter("http://www.yahoo.com"));
+    Assert.assertNotNull(domainFilter.filter("http://www.foobar.be"));
+    Assert.assertNull(domainFilter.filter("http://www.adobe.com"));
+  }
+  
+  @Test
+  public void testNoFilter() throws Exception {
+    // https://issues.apache.org/jira/browse/NUTCH-2189
+    String domainFile = SAMPLES + SEPARATOR + "this-file-does-not-exist.txt";
+    Configuration conf = NutchConfiguration.create();
+    DomainURLFilter domainFilter = new DomainURLFilter(domainFile);
+    domainFilter.setConf(conf);
+    Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org"));
+    Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org"));
+    Assert.assertNotNull(domainFilter.filter("http://www.apache.org"));
+    Assert.assertNotNull(domainFilter.filter("http://www.google.com"));
+    Assert.assertNotNull(domainFilter.filter("http://mail.yahoo.com"));
+    Assert.assertNotNull(domainFilter.filter("http://www.foobar.net"));
+    Assert.assertNotNull(domainFilter.filter("http://www.foobas.net"));
+    Assert.assertNotNull(domainFilter.filter("http://www.yahoo.com"));
+    Assert.assertNotNull(domainFilter.filter("http://www.foobar.be"));
+    Assert.assertNotNull(domainFilter.filter("http://www.adobe.com"));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-domainblacklist/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domainblacklist/build.xml b/nutch-plugins/urlfilter-domainblacklist/build.xml
new file mode 100644
index 0000000..19ea483
--- /dev/null
+++ b/nutch-plugins/urlfilter-domainblacklist/build.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-domainblacklist" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="data" />
+  </copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-domainblacklist/data/hosts.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domainblacklist/data/hosts.txt b/nutch-plugins/urlfilter-domainblacklist/data/hosts.txt
new file mode 100644
index 0000000..2b88c3b
--- /dev/null
+++ b/nutch-plugins/urlfilter-domainblacklist/data/hosts.txt
@@ -0,0 +1,5 @@
+# comments start with the pound sign
+net
+apache.org
+be
+www.yahoo.com
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-domainblacklist/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domainblacklist/ivy.xml b/nutch-plugins/urlfilter-domainblacklist/ivy.xml
new file mode 100644
index 0000000..24d7606
--- /dev/null
+++ b/nutch-plugins/urlfilter-domainblacklist/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-domainblacklist/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domainblacklist/plugin.xml b/nutch-plugins/urlfilter-domainblacklist/plugin.xml
new file mode 100644
index 0000000..04eee6e
--- /dev/null
+++ b/nutch-plugins/urlfilter-domainblacklist/plugin.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlfilter-domainblacklist"
+   name="Domain Blacklist URL Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlfilter-domainblacklist.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlfilter.domainblacklist"
+              name="Nutch Domain Blacklist URL Filter"
+              point="org.apache.nutch.net.URLFilter">
+      <implementation id="DomainBlacklistURLFilter"
+        class="org.apache.nutch.urlfilter.domainblacklist.DomainBlacklistURLFilter">
+        <parameter name="file" value="domainblacklist-urlfilter.txt"/>
+      </implementation>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-domainblacklist/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domainblacklist/pom.xml b/nutch-plugins/urlfilter-domainblacklist/pom.xml
new file mode 100644
index 0000000..a814579
--- /dev/null
+++ b/nutch-plugins/urlfilter-domainblacklist/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>urlfilter-domainblacklist</artifactId>
+    <packaging>jar</packaging>
+
+    <name>urlfilter-domainblacklist</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java b/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
new file mode 100644
index 0000000..37b1cdc
--- /dev/null
+++ b/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
@@ -0,0 +1,210 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.domainblacklist;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+import org.apache.commons.lang.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLFilter;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.util.URLUtil;
+import org.apache.nutch.util.domain.DomainSuffix;
+
+/**
+ * <p>
+ * Filters URLs based on a file containing domain suffixes, domain names, and
+ * hostnames. A url that matches one of the suffixes, domains, or hosts present
+ * in the file is filtered out.
+ * </p>
+ * 
+ * <p>
+ * Urls are checked in order of domain suffix, domain name, and hostname against
+ * entries in the domain file. The domain file would be setup as follows with
+ * one entry per line:
+ * 
+ * <pre>
+ * com apache.org www.apache.org
+ * </pre>
+ * 
+ * <p>
+ * The first line is an example of a filter that would exclude all .com domains.
+ * The second line excludes all urls from apache.org and all of its subdomains
+ * such as lucene.apache.org and hadoop.apache.org. The third line would exclude
+ * only urls from www.apache.org. There is no specific ordering to entries. The
+ * entries are from more general to more specific with the more general
+ * overriding the more specific.
+ * </p>
+ * 
+ * The domain file defaults to domainblacklist-urlfilter.txt in the classpath
+ * but can be overridden using the:
+ * 
+ * <ul>
+ * <li>
+ * property "urlfilter.domainblacklist.file" in ./conf/nutch-*.xml, and
+ * </li>
+ * <li>
+ * attribute "file" in plugin.xml of this plugin
+ * </li>
+ * </ul>
+ * 
+ * the attribute "file" has higher precedence if defined.
+ */
+public class DomainBlacklistURLFilter implements URLFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DomainBlacklistURLFilter.class);
+
+  // read in attribute "file" of this plugin.
+  private static String attributeFile = null;
+  private Configuration conf;
+  private String domainFile = null;
+  private Set<String> domainSet = new LinkedHashSet<String>();
+
+  private void readConfiguration(Reader configReader) throws IOException {
+
+    // read the configuration file, line by line
+    BufferedReader reader = new BufferedReader(configReader);
+    String line = null;
+    while ((line = reader.readLine()) != null) {
+      if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
+        // add non-blank lines and non-commented lines
+        domainSet.add(StringUtils.lowerCase(line.trim()));
+      }
+    }
+  }
+
+  /**
+   * Default constructor.
+   */
+  public DomainBlacklistURLFilter() {
+
+  }
+
+  /**
+   * Constructor that specifies the domain file to use.
+   * 
+   * @param domainFile
+   *          The domain file, overrides domainblacklist-urlfilter.txt default.
+   * 
+   * @throws IOException
+   */
+  public DomainBlacklistURLFilter(String domainFile) {
+    this.domainFile = domainFile;
+  }
+
+  /**
+   * Sets the configuration.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // get the extensions for domain urlfilter
+    String pluginName = "urlfilter-domainblacklist";
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint(URLFilter.class.getName()).getExtensions();
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+        attributeFile = extension.getAttribute("file");
+        break;
+      }
+    }
+
+    // handle blank non empty input
+    if (attributeFile != null && attributeFile.trim().equals("")) {
+      attributeFile = null;
+    }
+
+    if (attributeFile != null) {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+            + " as " + attributeFile);
+      }
+    } else {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+            + pluginName);
+      }
+    }
+
+    // domain file and attribute "file" take precedence if defined
+    String file = conf.get("urlfilter.domainblacklist.file");
+    String stringRules = conf.get("urlfilter.domainblacklist.rules");
+    if (domainFile != null) {
+      file = domainFile;
+    } else if (attributeFile != null) {
+      file = attributeFile;
+    }
+    Reader reader = null;
+    if (stringRules != null) { // takes precedence over files
+      reader = new StringReader(stringRules);
+    } else {
+      reader = conf.getConfResourceAsReader(file);
+    }
+    try {
+      if (reader == null) {
+        reader = new FileReader(file);
+      }
+      readConfiguration(reader);
+    } catch (IOException e) {
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  public String filter(String url) {
+    try {
+      // match for suffix, domain, and host in that order. more general will
+      // override more specific
+      String domain = URLUtil.getDomainName(url).toLowerCase().trim();
+      String host = URLUtil.getHost(url);
+      String suffix = null;
+      DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
+      if (domainSuffix != null) {
+        suffix = domainSuffix.getDomain();
+      }
+
+      if (domainSet.contains(suffix) || domainSet.contains(domain)
+          || domainSet.contains(host)) {
+        // Matches, filter!
+        return null;
+      }
+
+      // doesn't match, allow
+      return url;
+    } catch (Exception e) {
+
+      // if an error happens, reject the url (returning null filters it out)
+      LOG.error("Could not apply filter on url: " + url + "\n"
+          + org.apache.hadoop.util.StringUtils.stringifyException(e));
+      return null;
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/package-info.java b/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/package-info.java
new file mode 100644
index 0000000..1f0022c
--- /dev/null
+++ b/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/package-info.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL filter plugin to exclude URLs by domain suffixes, domain names, and/or host names.
+ * See {@link org.apache.nutch.urlfilter.domain} for the counterpart (include only URLs
+ * matching host or domain).
+ */
+package org.apache.nutch.urlfilter.domainblacklist;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java b/nutch-plugins/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
new file mode 100644
index 0000000..d253867
--- /dev/null
+++ b/nutch-plugins/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.domainblacklist;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+public class TestDomainBlacklistURLFilter {
+
+  private final static String SEPARATOR = System.getProperty("file.separator");
+  private final static String SAMPLES = System.getProperty("test.data", ".");
+
+  @Test
+  public void testFilter() throws Exception {
+
+    String domainBlacklistFile = SAMPLES + SEPARATOR + "hosts.txt";
+    Configuration conf = NutchConfiguration.create();
+    DomainBlacklistURLFilter domainBlacklistFilter = new DomainBlacklistURLFilter(
+        domainBlacklistFile);
+    domainBlacklistFilter.setConf(conf);
+    Assert.assertNull(domainBlacklistFilter.filter("http://lucene.apache.org"));
+    Assert.assertNull(domainBlacklistFilter.filter("http://hadoop.apache.org"));
+    Assert.assertNull(domainBlacklistFilter.filter("http://www.apache.org"));
+    Assert.assertNotNull(domainBlacklistFilter.filter("http://www.google.com"));
+    Assert.assertNotNull(domainBlacklistFilter.filter("http://mail.yahoo.com"));
+    Assert.assertNull(domainBlacklistFilter.filter("http://www.foobar.net"));
+    Assert.assertNull(domainBlacklistFilter.filter("http://www.foobas.net"));
+    Assert.assertNull(domainBlacklistFilter.filter("http://www.yahoo.com"));
+    Assert.assertNull(domainBlacklistFilter.filter("http://www.foobar.be"));
+    Assert.assertNotNull(domainBlacklistFilter.filter("http://www.adobe.com"));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-ignoreexempt/README.md
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-ignoreexempt/README.md b/nutch-plugins/urlfilter-ignoreexempt/README.md
new file mode 100644
index 0000000..d48b672
--- /dev/null
+++ b/nutch-plugins/urlfilter-ignoreexempt/README.md
@@ -0,0 +1,43 @@
+urlfilter-ignoreexempt
+======================
+  This plugin allows certain urls to be exempted when external links are configured to be ignored.
+  This is useful when a focused crawl is set up but some resources, like static files, are linked from CDNs (external domains).
+
+# How to enable ?
+Add `urlfilter-ignoreexempt` value to `plugin.includes` property
+```xml
+<property>
+  <name>plugin.includes</name>
+  <value>protocol-http|urlfilter-(regex|ignoreexempt)...</value>
+</property>
+```
+
+# How to configure rules?
+
+Open `conf/db-ignore-external-exemptions.txt` and add the regex rules.
+
+## Format :
+
+The format is the same as `regex-urlfilter.txt`.
+ Each non-comment, non-blank line contains a regular expression
+ prefixed by '+' or '-'.  The first matching pattern in the file
+ determines whether a URL is exempted or ignored.  If no pattern
+ matches, the URL is ignored.
+
+
+## Example :
+
+ To exempt urls ending with image extensions, use this rule
+
+`+(?i)\.(jpg|png|gif)$`
+
+   
+   
+## Testing the Rules :
+
+After enabling the plugin and adding your rules to `conf/db-ignore-external-exemptions.txt`, run:
+   
+`bin/nutch plugin urlfilter-ignoreexempt  org.apache.nutch.urlfilter.ignoreexempt.ExemptionUrlFilter http://yoururl.here`
+
+
+This should print `true` for urls which are accepted by configured rules.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-ignoreexempt/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-ignoreexempt/build.xml b/nutch-plugins/urlfilter-ignoreexempt/build.xml
new file mode 100644
index 0000000..105f551
--- /dev/null
+++ b/nutch-plugins/urlfilter-ignoreexempt/build.xml
@@ -0,0 +1,55 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-ignoreexempt" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-regex-filter/*.jar" />
+      <include name="**/urlfilter-regex/*.jar" />
+    </fileset>
+    <pathelement location="${nutch.root}/build/lib-regex-filter/test"/>
+    <pathelement location="${nutch.root}/build/urlfilter-regex/test"/>
+  </path>
+
+  <!-- Compile test classes for dependencies -->
+  <target name="deps-test-compile">
+    <ant target="compile-test" inheritall="false" dir="../lib-regex-filter"/>
+    <ant target="compile-test" inheritall="false" dir="../urlfilter-regex"/>
+  </target>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../lib-regex-filter"/>
+    <ant target="deploy" inheritall="false" dir="../urlfilter-regex"/>
+  </target>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="data" />
+  </copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-ignoreexempt/data/.donotdelete
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-ignoreexempt/data/.donotdelete b/nutch-plugins/urlfilter-ignoreexempt/data/.donotdelete
new file mode 100644
index 0000000..e69de29

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-ignoreexempt/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-ignoreexempt/ivy.xml b/nutch-plugins/urlfilter-ignoreexempt/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/urlfilter-ignoreexempt/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-ignoreexempt/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-ignoreexempt/plugin.xml b/nutch-plugins/urlfilter-ignoreexempt/plugin.xml
new file mode 100644
index 0000000..4139ca4
--- /dev/null
+++ b/nutch-plugins/urlfilter-ignoreexempt/plugin.xml
@@ -0,0 +1,45 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlfilter-ignoreexempt"
+   name="External Domain Ignore Exemption"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlfilter-ignoreexempt.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-regex-filter"/>
+      <import plugin="urlfilter-regex"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlfilter.ignoreexempt"
+              name="Ignore Exemption Url Filter"
+              point="org.apache.nutch.net.URLExemptionFilter">
+      <implementation id="ExemptionUrlFilter"
+        class="org.apache.nutch.urlfilter.ignoreexempt.ExemptionUrlFilter">
+        <parameter name="file" value="db-ignore-external-exemptions.txt"/>
+      </implementation>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-ignoreexempt/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-ignoreexempt/pom.xml b/nutch-plugins/urlfilter-ignoreexempt/pom.xml
new file mode 100644
index 0000000..fd26587
--- /dev/null
+++ b/nutch-plugins/urlfilter-ignoreexempt/pom.xml
@@ -0,0 +1,45 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>urlfilter-ignoreexempt</artifactId>
+    <packaging>jar</packaging>
+
+    <name>urlfilter-ignoreexempt</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>urlfilter-regex</artifactId>
+            <version>${project.parent.version}</version>
+        </dependency>
+    </dependencies>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java b/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
new file mode 100644
index 0000000..bbac300
--- /dev/null
+++ b/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.ignoreexempt;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLExemptionFilter;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.urlfilter.regex.RegexURLFilter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.regex.Pattern;
+import java.util.List;
+import java.util.ArrayList;
+
+
+/**
+ * This implementation of {@link org.apache.nutch.net.URLExemptionFilter} uses regex configuration
+ * to check whether a URL is eligible for exemption from 'db.ignore.external'.
+ * When this filter is enabled, external URLs are checked against the configured sequence of regex rules.
+ *<p>
+ * The exemption rule file defaults to db-ignore-external-exemptions.txt in the classpath but can be
+ * overridden using the property <code>db.ignore.external.exemptions.file</code> in ./conf/nutch-*.xml
+ *</p>
+ *
+ * The exemption rules are specified in plain text file where each line is a rule.
+ * The format is the same as <code>regex-urlfilter.txt</code>.
+ * Each non-comment, non-blank line contains a regular expression
+ * prefixed by '+' or '-'.  The first matching pattern in the file
+ * determines whether a URL is exempted or ignored.  If no pattern
+ * matches, the URL is ignored.
+ *
+ * @since Feb 10, 2016
+ * @version 1
+ * @see org.apache.nutch.net.URLExemptionFilter
+ * @see org.apache.nutch.urlfilter.regex.RegexURLFilter
+ */
+public class ExemptionUrlFilter extends RegexURLFilter
+    implements URLExemptionFilter {
+
+  public static final String DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE
+      = "db.ignore.external.exemptions.file";
+  private static final Logger LOG =
+      LoggerFactory.getLogger(ExemptionUrlFilter.class);
+
+  private List<Pattern> exemptions;
+  private Configuration conf;
+
+  public List<Pattern> getExemptions() {
+    return exemptions;
+  }
+
+  @Override
+  public boolean filter(String fromUrl, String toUrl) {
+    //this implementation does not consider fromUrl param.
+    //the regex rules are applied to toUrl.
+    return this.filter(toUrl) != null;
+  }
+
+  /**
+   * Gets reader for regex rules
+   */
+  protected Reader getRulesReader(Configuration conf)
+      throws IOException {
+    String fileRules = conf.get(DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE);
+    return conf.getConfResourceAsReader(fileRules);
+  }
+
+  public static void main(String[] args) {
+
+    if (args.length != 1) {
+      System.out.println("Error: Invalid Args");
+      System.out.println("Usage: " +
+          ExemptionUrlFilter.class.getName() + " <url>");
+      return;
+    }
+    String url = args[0];
+    ExemptionUrlFilter instance = new ExemptionUrlFilter();
+    instance.setConf(NutchConfiguration.create());
+    System.out.println(instance.filter(null, url));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java b/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java
new file mode 100644
index 0000000..ee949c5
--- /dev/null
+++ b/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL filter plugin which identifies exemptions for external URLs when
+ * external URLs are set to be ignored.
+ *
+ */
+package org.apache.nutch.urlfilter.ignoreexempt;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-prefix/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-prefix/build.xml b/nutch-plugins/urlfilter-prefix/build.xml
new file mode 100644
index 0000000..33faa48
--- /dev/null
+++ b/nutch-plugins/urlfilter-prefix/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-prefix" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-prefix/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-prefix/ivy.xml b/nutch-plugins/urlfilter-prefix/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/urlfilter-prefix/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-prefix/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-prefix/plugin.xml b/nutch-plugins/urlfilter-prefix/plugin.xml
new file mode 100644
index 0000000..22cfcaf
--- /dev/null
+++ b/nutch-plugins/urlfilter-prefix/plugin.xml
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlfilter-prefix"
+   name="Prefix URL Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlfilter-prefix.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlfilter.prefix"
+              name="Nutch Prefix URL Filter"
+              point="org.apache.nutch.net.URLFilter">
+      <implementation id="PrefixURLFilter"
+                      class="org.apache.nutch.urlfilter.prefix.PrefixURLFilter"/>
+      <!-- by default, attribute "file" is undefined, to keep classic behavior.
+      <implementation id="PrefixURLFilter"
+                      class="org.apache.nutch.urlfilter.prefix.PrefixURLFilter">
+        <parameter name="file" value="urlfilter-prefix.txt"/>
+      </implementation>
+      -->
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-prefix/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-prefix/pom.xml b/nutch-plugins/urlfilter-prefix/pom.xml
new file mode 100644
index 0000000..65ad019
--- /dev/null
+++ b/nutch-plugins/urlfilter-prefix/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>urlfilter-prefix</artifactId>
+    <packaging>jar</packaging>
+
+    <name>urlfilter-prefix</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java b/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
new file mode 100644
index 0000000..2e955b5
--- /dev/null
+++ b/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
@@ -0,0 +1,178 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.prefix;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.*;
+
+import org.apache.nutch.util.PrefixStringMatcher;
+import org.apache.nutch.util.TrieStringMatcher;
+
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+
+import java.io.Reader;
+import java.io.FileReader;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.IOException;
+import java.io.StringReader;
+
+import java.util.List;
+import java.util.ArrayList;
+
+/**
+ * Filters URLs based on a file of URL prefixes. The file is named by (1)
+ * property "urlfilter.prefix.file" in ./conf/nutch-default.xml, and (2)
+ * attribute "file" in plugin.xml of this plugin. Attribute "file" has higher
+ * precedence if defined.
+ * 
+ * <p>
+ * The format of this file is one URL prefix per line.
+ * </p>
+ */
+public class PrefixURLFilter implements URLFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(PrefixURLFilter.class);
+
+  // read in attribute "file" of this plugin.
+  private static String attributeFile = null;
+
+  private TrieStringMatcher trie;
+
+  private Configuration conf;
+
+  public PrefixURLFilter() throws IOException {
+
+  }
+
+  public PrefixURLFilter(String stringRules) throws IOException {
+    trie = readConfiguration(new StringReader(stringRules));
+  }
+
+  public String filter(String url) {
+    if (trie.shortestMatch(url) == null)
+      return null;
+    else
+      return url;
+  }
+
+  private TrieStringMatcher readConfiguration(Reader reader) throws IOException {
+
+    BufferedReader in = new BufferedReader(reader);
+    List<String> urlprefixes = new ArrayList<String>();
+    String line;
+
+    while ((line = in.readLine()) != null) {
+      if (line.length() == 0)
+        continue;
+
+      char first = line.charAt(0);
+      switch (first) {
+      case ' ':
+      case '\n':
+      case '#': // skip blank & comment lines
+        continue;
+      default:
+        urlprefixes.add(line);
+      }
+    }
+
+    return new PrefixStringMatcher(urlprefixes);
+  }
+
+  public static void main(String args[]) throws IOException {
+
+    PrefixURLFilter filter;
+    if (args.length >= 1)
+      filter = new PrefixURLFilter(args[0]);
+    else
+      filter = new PrefixURLFilter();
+
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+    while ((line = in.readLine()) != null) {
+      String out = filter.filter(line);
+      if (out != null) {
+        System.out.println(out);
+      }
+    }
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    String pluginName = "urlfilter-prefix";
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint(URLFilter.class.getName()).getExtensions();
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+        attributeFile = extension.getAttribute("file");
+        break;
+      }
+    }
+    if (attributeFile != null && attributeFile.trim().equals(""))
+      attributeFile = null;
+    if (attributeFile != null) {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+            + " as " + attributeFile);
+      }
+    } else {
+      // if (LOG.isWarnEnabled()) {
+      // LOG.warn("Attribute \"file\" is not defined in plugin.xml for
+      // plugin "+pluginName);
+      // }
+    }
+
+    String file = conf.get("urlfilter.prefix.file");
+    String stringRules = conf.get("urlfilter.prefix.rules");
+    // attribute "file" takes precedence if defined
+    if (attributeFile != null)
+      file = attributeFile;
+    Reader reader = null;
+    if (stringRules != null) { // takes precedence over files
+      reader = new StringReader(stringRules);
+    } else {
+      reader = conf.getConfResourceAsReader(file);
+    }
+
+    if (reader == null) {
+      trie = new PrefixStringMatcher(new String[0]);
+    } else {
+      try {
+        trie = readConfiguration(reader);
+      } catch (IOException e) {
+        if (LOG.isErrorEnabled()) {
+          LOG.error(e.getMessage());
+        }
+        // TODO mb@media-style.com: throw Exception? Because broken api.
+        throw new RuntimeException(e.getMessage(), e);
+      }
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/package.html b/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/package.html
new file mode 100644
index 0000000..dbed0be
--- /dev/null
+++ b/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>URL filter plugin to include only URLs which match one of a given list of URL prefixes.</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java b/nutch-plugins/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java
new file mode 100644
index 0000000..b7a7ce4
--- /dev/null
+++ b/nutch-plugins/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.prefix;
+
+import junit.framework.Test;
+import junit.framework.TestCase;
+import junit.framework.TestSuite;
+import junit.textui.TestRunner;
+
+import java.io.IOException;
+
+
+/**
+ * JUnit test for <code>PrefixURLFilter</code>.
+ *
+ * @author Talat Uyarer
+ * @author Cihad Guzel
+ */
+public class TestPrefixURLFilter extends TestCase {
+  private static final String prefixes =
+    "# this is a comment\n" +
+    "\n" +
+    "http://\n" +
+    "https://\n" +
+    "file://\n" +
+    "ftp://\n";
+
+  private static final String[] urls = new String[] {
+    "http://www.example.com/",
+    "https://www.example.com/",
+    "ftp://www.example.com/",
+    "file://www.example.com/",
+    "abcd://www.example.com/",
+    "www.example.com/",
+  };
+
+  private static String[] urlsModeAccept = new String[] {
+    urls[0],
+    urls[1],
+    urls[2],
+    urls[3],
+    null,
+    null
+  };
+
+  private PrefixURLFilter filter = null;
+
+  public static Test suite() {
+    return new TestSuite(TestPrefixURLFilter.class);
+  }
+
+  public static void main(String[] args) {
+    TestRunner.run(suite());
+  }
+
+  public void setUp() throws IOException {
+    filter = new PrefixURLFilter(prefixes);
+  }
+
+  public void testModeAccept() {
+    for (int i = 0; i < urls.length; i++) {
+      assertTrue(urlsModeAccept[i] == filter.filter(urls[i]));
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-regex/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/build.xml b/nutch-plugins/urlfilter-regex/build.xml
new file mode 100644
index 0000000..5b80d08
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/build.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-regex" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-regex-filter/*.jar" />
+    </fileset>
+    <pathelement location="${nutch.root}/build/lib-regex-filter/test"/>
+  </path>
+
+  <!-- Compile test classes for dependencies -->
+  <target name="deps-test-compile">
+    <ant target="compile-test" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="sample" includes="**/*.rules, **/*.urls"/>
+  </copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-regex/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/ivy.xml b/nutch-plugins/urlfilter-regex/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-regex/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/plugin.xml b/nutch-plugins/urlfilter-regex/plugin.xml
new file mode 100644
index 0000000..34f4a91
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/plugin.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlfilter-regex"
+   name="Regex URL Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlfilter-regex.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-regex-filter"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlfilter.regex"
+              name="Nutch Regex URL Filter"
+              point="org.apache.nutch.net.URLFilter">
+      <implementation id="RegexURLFilter"
+                      class="org.apache.nutch.urlfilter.regex.RegexURLFilter"/>
+      <!-- by default, attribute "file" is undefined, to keep classic behavior.
+      <implementation id="RegexURLFilter"
+                      class="org.apache.nutch.urlfilter.regex.RegexURLFilter">
+        <parameter name="file" value="urlfilter-regex.txt"/>
+      </implementation>
+      -->
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-regex/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/pom.xml b/nutch-plugins/urlfilter-regex/pom.xml
new file mode 100644
index 0000000..237a5b9
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/pom.xml
@@ -0,0 +1,46 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>urlfilter-regex</artifactId>
+    <packaging>jar</packaging>
+
+    <name>urlfilter-regex</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>lib-regex-filter</artifactId>
+            <version>${project.parent.version}</version>
+        </dependency>
+    </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-regex/sample/Benchmarks.rules
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/sample/Benchmarks.rules b/nutch-plugins/urlfilter-regex/sample/Benchmarks.rules
new file mode 100644
index 0000000..c8901e2
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/sample/Benchmarks.rules
@@ -0,0 +1,26 @@
+# The url filter file used by the crawl command.
+
+# Sample rule set for the regex URL filter. The companion Benchmarks.urls
+# file lists URLs prefixed with the expected result: '+' accepted, '-' rejected.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'.  The first matching pattern in the file
+# determines whether a URL is included or ignored.  If no pattern
+# matches, the URL is ignored.
+
+# skip file:, ftp:, & mailto: urls
+-^(file|ftp|mailto):
+
+# skip image and other suffixes we can't yet parse
+-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$
+
+# skip URLs containing certain characters as probable queries, etc.
+-[?*!@=]
+
+# skip .fr .org and .net domains
+-^.*//.*\.fr/
+-^.*//.*\.org/
+-^.*//.*\.net/
+
+# accept anything else
++.

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-regex/sample/Benchmarks.urls
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/sample/Benchmarks.urls b/nutch-plugins/urlfilter-regex/sample/Benchmarks.urls
new file mode 100644
index 0000000..40bf4ee
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/sample/Benchmarks.urls
@@ -0,0 +1,297 @@
++http://www.hostip.info/
+-http://www.elanceur.org/Articles/OntologieSurfaite.html
++http://www.opensymphony.com/quartz/
+-http://www.portletbridge.org/saxbenchmark/index.html
++http://www.lesmotsdelinfo.com/
++http://usefulinc.com/doap/
++http://www.codezoo.com/
++http://search.infocious.com/
+-http://pedagogie.ac-montpellier.fr/disciplines/anglais/tice/sms.html
++http://www.brics.dk/%7Eamoeller/automaton/
++http://jazzz.com/wp.html
++http://www.maxkiesler.com/index.php
++http://adscriptum.blogspot.com/2006/03/google-et-la-prsentation-deric-schmidt.html
++http://www.alias-i.com/lingpipe/
+-http://johnny.ihackstuff.com/index.php?module=prodreviews
+-http://www.spurl.net/
++http://www.dropload.com/
++http://vivisimo.com/
++http://www.marumushi.com/apps/newsmap/newsmap.cfm
++http://www.ixquick.com/
+-http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html
++http://www.mail-archive.com/
++http://www.spymac.com/
+-http://browsers.evolt.org/
+-http://www.oswd.org/
++http://www.stayinvisible.com/index.pl
++http://java.sun.com/j2se/1.4.2/docs/api/index.html
++http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/ntcmds.mspx
++http://www.bloglines.com/
+-http://www.fckeditor.net/
++http://search.msn.com/
+-http://www.grub.org/
++http://www.xml.com/pub/a/2000/11/29/schemas/part1.html
+-http://www.mnot.net/cache_docs/
+-http://www.furl.net/
++http://www.blogpulse.com/
++http://www.googlefight.com/
++http://www.rokulabs.com/
+-http://mightylegends.zapto.org/dvd/dvdauthor_howto.php
+-http://www.batbox.org/wrt54g-linux.html
+-http://en.wikipedia.org/wiki/%s
++http://www.sipcenter.com/
++http://www.merriampark.com/ld.htm
++http://anon.inf.tu-dresden.de/index_en.html
++http://www.pluck.com/
++http://www.tiddlywiki.com/
++http://www.jux2.com/
++http://clusty.com/
+-http://findability.org/
++http://www.searchengineshowdown.com/
++http://www.nhacks.com/email/index.php
++http://www.koders.com/
++http://www.cs.rochester.edu/sosp2003/papers/p125-ghemawat.pdf
++http://www.gmailwiki.com/index.php/Main_Page
++http://www.tadalist.com/
++http://www.net2ftp.com/
++http://www.streamload.com/
++http://www.lucazappa.com/brilliantMaker/buttonImage.php
++http://www.hybernaut.com/bdv/delicious-import.html
++http://www.gtmcknight.com/buttons/
++http://amb.vis.ne.jp/mozilla/scrapbook/
++http://g-metrics.com/index.php
+-http://tor.eff.org/
++http://www.search-this.com/search_engine_decoder.asp
++http://www.onjava.com/pub/a/onjava/2005/01/26/classloading.html
++http://www.adaptivepath.com/publications/essays/archives/000385.php
+-http://isnoop.net/gmail/
+-http://openweb.eu.org/
++http://www.mistergooddeal.com/
++http://javatoolbox.com/
+-http://www.freenews.fr/
++http://www.wikiwax.com/
+-http://today.java.net/pub/a/today/2005/04/21/farm.html
++http://users.skynet.be/J.Beever/pave.htm
++http://www.lundi8h.com/
++http://www.snap.com/
++http://www.goosee.com/puppy/index.shtml
+-http://www.softwarefreedom.org/index.html
+-http://y.20q.net/
++http://www.bitty.com/
++http://www.lafraise.com/
+-http://www.liquidinformation.org/
++http://www.searchtools.com/
++http://www.martinfowler.com/articles/injection.html
++http://pdos.csail.mit.edu/scigen/
+-http://developer.yahoo.net/blog/
++http://blogger-templates.blogspot.com/
++http://phpadsnew.com/two/
++http://www.langreiter.com/exec/yahoo-vs-google.html
+-http://www.dataparksearch.org/
+-http://www.yubnub.org/
+-http://www.fing.org/
+-http://www.swish-e.org/
+-http://www.openajax.net/wordpress/
++http://crypto.stanford.edu/PwdHash/
++http://www.html-kit.com/favicon/
+-http://today.java.net/pub/a/today/2005/08/09/didyoumean.html?page=1
++http://www.durhamtownship.com/
++http://jiwire.com/
++http://www.insilmaril.de/vym/
+-http://www.spreadshirt.net/
++http://www.goffice.com/
++http://www.writely.com/
++http://www.milindparikh.com/
++http://www.onjava.com/pub/a/onjava/2005/02/02/bitsets.html
++http://www.wikyblog.com/Map/Guest/Home
+-http://www.kottke.org/05/08/googleos-webos
++http://www.rollyo.com/
++http://www.meebo.com/
++http://www.factbites.com/
++http://www.placeopedia.com/
++http://swoogle.umbc.edu/
++http://www.viaduc.com/
+-http://demo.wikiwyg.net/wikiwyg/demo/standalone/
++http://podcasts.yahoo.com/
+-http://beaglewiki.org/Main_Page
++http://yq.search.yahoo.com/
+-http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html?page=1
++http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html
++http://socialight.com/
++http://www.lexxe.com/
++http://www.xom.nu/
++http://www.turboprint.de/
++http://www.whatdoesthatmean.com/index.php/Welcome_to_%27Whatdoesthatmean%3F%27
++http://www.wi-fiplanet.com/tutorials/article.php/3562391
++http://particletree.com/features/10-tips-to-a-better-form/
++http://www.songbirdnest.com/
+-http://www.w3.org/Talks/Tools/Slidy/
+-http://www.compassframework.org/display/SITE/Home
++http://motrech.blogspot.com/
++http://www.moteurzine.com/
++http://www.mex-search.com/
+-http://beta.previewseek.com/?mdc=y&amp;twin=n&amp;ilang=french
++http://www.goshme.com/
++http://rialto.application-servers.com/
++http://www.multe-pass.com/
++http://www.tailrank.com/
++http://www.vandertramp.com/INTERNETDOWN/
++http://www.letterjames.de/index.html
++http://code.google.com/index.html
++http://www.kritx.com/
++http://performancing.com/firefox
++http://www.mywebsearch.com/
+-http://en.wikibooks.org/w/index.php?title=Wikimania05/IM1
++http://www.lukew.com/resources/articles/blogs2.asp
+-http://www.hyperwords.net/
++http://ajax.parish.ath.cx/translator/
++http://www.maplandia.com/
+-http://www.tbray.org/ongoing/When/200x/2006/01/08/No-New-XML-Languages
++http://onefeed.com/index.php
++http://www.file-swap.com/
+-http://opennlp.org/
++http://mindprod.com/jgloss/encoding.html
++http://code.google.com/webstats/index.html
++http://www.freeweb-hosting.com/google_pagerank_pr_checker/
+-http://www.framakey.org/
+-http://microformats.org/wiki/hreview
+-http://www.ashesandsnow.org/index2.html
+-http://uima-framework.sourceforge.net/
++http://sethgodin.typepad.com/seths_blog/2006/01/flipping_the_fu.html
+-http://www.anandtech.com/IT/showdoc.aspx?i=2523&amp;p=2
++http://fr.techcrunch.com/
+-http://developer.yahoo.net/yui/
++http://www.fredrikodman.com/
++http://www.mpirical.com/companion/mpirical_companion.html
++http://www.onjava.com/pub/a/onjava/2005/08/03/drools.html
+-http://k9copy.free.fr/
+-http://lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3
+-http://www.tbray.org/ongoing/When/200x/2006/01/09/On-XML-Language-Design
+-http://lespetitescases.net/structurer-decrire-et-organiser-l-information-2
++http://blogokat.canalblog.com/archives/2005/11/02/882454.html
++http://robur.slu.se/jensl/xmlclitools/
+-http://www.internetactu.net/?p=6291
+-http://www.xml.com/pub/a/2005/10/19/microformats-and-web-2.0.html?page=1
++http://www.memodata.com/2004/fr/alexandria/
+-http://presse-citron.net/?2006/01/23/654-joomla-pete-grave
++http://www.randomerror.com/
++http://www.i-cherubini.it/mauro/blog/2006/01/05/techniques-for-determining-the-location-on-umts-networks/
+-http://fr.newsgator.com/ngs/subscriber/WebEd2.aspx?fid=368395
+-http://interstices.info/display.jsp?id=c_15918
++http://www.tech-invite.com/
++http://www.croczilla.com/zap
+-http://www.libervis.com/modules/wordpress/?p=13
++http://www.searchmorph.com/wp/2005/07/19/recent-discovery-clickfraud-tools/
+-http://savoirscdi.cndp.fr/CulturePro/actualisation/Serres/Serres.htm
++http://www.influo.com/
++http://www.dsi-info.ca/chroniques/chroniques-recherche-web.html
+-http://www.addnb.org/fr/docs/webinvisible.htm
+-http://manhack.net/
+-http://www.jibaku.net/
++http://www.pipologie.com/
++http://christophenoel.blogspot.com/
+-http://www.seekport.fr/seekbot/
++http://beta.exalead.com/
+-http://www.boolgum.fr/index.html
++http://www.kesako.canalblog.com/
++http://loran.blogspot.com/
++http://outils-recherche.blogspot.com/
++http://www.art-dept.com/artists/giacobbe/
++http://www.meggould.netfirms.com/site_seeingIII.htm
++http://www.freedpi.com/
++http://www.frenchfred.com/
++http://www.photoways.com/
+-http://freco.free.fr/index.htm
+-http://triturages.free.fr/index.htm
+-http://www.qsos.org/
++http://www.alvis.info/alvis/
++http://www.i-cherubini.it/mauro/blog/2005/12/16/open-source-information-retrieval-systems/
+-http://www.shinux.org/
++http://www.linuxlinks.com/Distributions/Mini_Distributions/index.shtml
++http://www.kurobox.com/online/tiki-index.php
+-http://news.gmane.org/gmane.comp.misc.linkstation.linux
++http://www.imsbook.com/SIP-IMS-Standards-List.html
+-http://incubator.apache.org/directory/subprojects/snickers/
+-http://www.mozilla.org/projects/security/pki/jss/javadoc/org/mozilla/jss/asn1/package-summary.html
+-http://sourceforge.net/projects/cryptix-asn1/
+-http://sourceforge.net/projects/basn/
+-http://asn1.elibel.tm.fr/fr/index.htm
+-http://sourceforge.net/projects/a2j/
++http://www.degrouptest.com/
++http://interstices.info/
++http://louvre-boite.viabloga.com/news/18.shtml
+-http://tel.ccsd.cnrs.fr/documents/archives0/00/00/62/60/index_fr.html
++http://poiplace.oabsoftware.nl/
+-http://www.gpspassion.com/forumsen/topic.asp?TOPIC_ID=7759
+-http://www.yoono.com/favorites.jsp?user-id=lquerel
+-http://www.librecours.org/cgi-bin/main
+-http://www.onjava.com/pub/a/onjava/2006/01/18/using-lucene-to-search-java-source.html?page=1
+-http://limo.sourceforge.net/
++http://www-scf.usc.edu/%7Emattmann/
++http://spaces.msn.com/members/famillezen/
+-http://photos.joune.org/
+-http://www.canon.fr/paperart/
++http://flash.eastweb.ru/files/20051024092150.swf
++http://www.xsltwiki.com/index.php/Main_Page
++http://www.i-cherubini.it/mauro/blog/2005/12/08/software-that-goes-on-a-stick/
+-http://www.webrankinfo.com/forums/forum_15.htm?sid=307384cdbce813aa19ba017513cbbc31
++http://www.loiclemeur.com/france/2006/01/eric_tenin_se_f.html
+-http://member.openmobilealliance.org/ftp/Public_documents/MCC/2005/
++http://www.aeliosfinance.com/
++http://www.capital-it.com/
+-http://www.tradedoubler.fr/pan/public/solutions/publisher
+-http://www.recherche.gouv.fr/technologie/concours/2006/index.htm
++http://www.techcrunch.com/2005/12/21/gravee-takes-a-new-approach-to-search/
++http://wanabo.com/
+-http://www.lespetitescases.net/structurer-decrire-et-organiser-l-information-1
+-http://presse-citron.net/?2006/02/07/705-joue-la-comme-stickam
++http://aeliosfinance.com/
++http://www.centreincubation.com/
++http://www.franceincubation.com/
+-http://www.oseo.fr/
++http://www.i18nfaq.com/chardet.html
+-http://cpdetector.sourceforge.net/
++http://www.jeremi.info/index.php/2005/07/21/7-introduction-aux-methodes-agiles
++http://chezlorry.ca/Accueil.htm
++http://cetnia.blogs.com/d_lires/
+-http://www.directwine.fr/
++http://www.new-phenix.com/
+-http://upnp.sourceforge.net/
+-http://www.pixmania.fr/
+-http://www.lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3
++http://www.i-cherubini.it/mauro/blog/2006/01/25/kwmap-a-keyword-search-visualization-tool/
++http://www.stepnewz.com/sn/default.asp
++http://opquast.com/
+-http://www.freeplayer.org/
+-http://www.cafe-clope.net/orangeamere/index.php/2005/08/24/5-le-modele-contributif-une-utopie
+-http://atomcomputer.free.fr/fbox/
+-http://www.internetactu.net/index.php?p=6100
+-http://mammouthland.free.fr/cours/css/genecss.php
+-http://www.xml.com/pub/a/2006/02/01/doing-http-caching-right-introducing-httplib2.html?page=1
++http://www-106.ibm.com/developerworks/xml/library/x-xapi.html
+-http://xml.apache.org/xalan-j/extensions.html
++http://developers.sun.com/foryourbusiness/jcc/
++http://blogs.sun.com/roller/page/roumen/Weblog
+-http://www.onjava.com/pub/a/onjava/2005/10/12/diagnostic-tests-with-ant.html?page=1
+-http://blog.developpez.com/index.php?blog=51&amp;p=1389&amp;more=1&amp;c=1&amp;tb=1&amp;pb=1
++http://dcabasson.developpez.com/articles/javascript/ajax/ajax-autocompletion-pas-a-pas/
++http://odur.let.rug.nl/%7Evannoord/
+-http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
+-http://artist.inist.fr/
++http://www.elra.info/
+-http://beinecke.library.yale.edu/dl_crosscollex/SearchExecXC.asp?srchtype=CNO
++http://www.i-cherubini.it/mauro/blog/2005/12/13/information-retrieval-system-evaluation-effort-sensitivity-and-reliability
++http://www.i-cherubini.it/mauro/blog/2005/12/13/trec-datasets-text-retrieval-conference-datasets-for-information-retrieval
++http://www.i-cherubini.it/mauro/blog/2005/12/12/focused-crawling-using-context-graphs/
++http://www.i-cherubini.it/mauro/blog/2005/12/08/spam-filtering-using-contextual-network-graphs/
++http://www.cs.northwestern.edu/%7Evidya/semanticons/IconsWebPage/
++http://www.i-cherubini.it/mauro/blog/2006/01/05/social-information-retrieval/
++http://www.i-cherubini.it/mauro/blog/2006/01/04/an-introduction-to-random-indexing/
++http://dossierdoc.typepad.com/descripteurs/2006/01/liste_de_thsaur.html
+-http://www.lexique.org/
++http://www.i-cherubini.it/mauro/blog/2006/01/22/montylingua-a-commonsense-enriched-part-of-speech-tagger/
++http://www.streamium.com/products/mx6000i/
+-http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&amp;scy=FR&amp;slg=fr
+-http://store.interact-tv.com/store/product_info.php?cPath=9&amp;products_id=73
++http://www.tversity.com/
+-http://www.aspseek.org/index.php
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-regex/sample/IntranetCrawling.rules
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/sample/IntranetCrawling.rules b/nutch-plugins/urlfilter-regex/sample/IntranetCrawling.rules
new file mode 100644
index 0000000..705bdb2
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/sample/IntranetCrawling.rules
@@ -0,0 +1,27 @@
+# The url filter file used by the crawl command.
+
+# Better for intranet crawling.
+# Be sure to change MY.DOMAIN.NAME to your domain name.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'.  The first matching pattern in the file
+# determines whether a URL is included or ignored.  If no pattern
+# matches, the URL is ignored.
+
+# skip file:, ftp:, & mailto: urls
+-^(file|ftp|mailto):
+
+# skip image and other suffixes we can't yet parse
+-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$
+
+# skip URLs containing certain characters as probable queries, etc.
+-[?*!@=]
+
+# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
+-.*(/.+?)/.*?\1/.*?\1/
+
+# accept hosts in MY.DOMAIN.NAME
++^http://([a-z0-9]*\.)*MY.DOMAIN.NAME/
+
+# skip everything else
+-.

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-regex/sample/IntranetCrawling.urls
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/sample/IntranetCrawling.urls b/nutch-plugins/urlfilter-regex/sample/IntranetCrawling.urls
new file mode 100644
index 0000000..b1ad9b7
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/sample/IntranetCrawling.urls
@@ -0,0 +1,8 @@
+-file://home/jc/nutch/index.html
+-ftp://ftp.apache.org/nutch.html
+-mailto:jerome.charron@gmail.com
+-news://any.news.server/comp.lang.java
+-whois:/nutch.org
++http://MY.DOMAIN.NAME/
++http://MY.DOMAIN.NAME/nutch
++http://www.MY.DOMAIN.NAME/

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-regex/sample/WholeWebCrawling.rules
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/sample/WholeWebCrawling.rules b/nutch-plugins/urlfilter-regex/sample/WholeWebCrawling.rules
new file mode 100644
index 0000000..8778921
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/sample/WholeWebCrawling.rules
@@ -0,0 +1,22 @@
+# The default url filter.
+# Better for whole-internet crawling.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'.  The first matching pattern in the file
+# determines whether a URL is included or ignored.  If no pattern
+# matches, the URL is ignored.
+
+# skip file: ftp: and mailto: urls
+-^(file|ftp|mailto):
+
+# skip image and other suffixes we can't yet parse
+-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe)$
+
+# skip URLs containing certain characters as probable queries, etc.
+-[?*!@=]
+
+# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
+-.*(/.+?)/.*?\1/.*?\1/
+
+# accept anything else
++.

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-regex/sample/WholeWebCrawling.urls
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/sample/WholeWebCrawling.urls b/nutch-plugins/urlfilter-regex/sample/WholeWebCrawling.urls
new file mode 100644
index 0000000..ccb6269
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/sample/WholeWebCrawling.urls
@@ -0,0 +1,11 @@
+-file://home/jc/nutch/index.html
+-ftp://ftp.apache.org/nutch.html
+-mailto:jerome.charron@gmail.com
++news://any.news.server/comp.lang.java
++whois:/nutch.org
+-http://www.nutch.org/nutch.gif
+-http://www.nutch.org/nutch.eps
+-http://www.nutch.org/nutch?q=nutch
++http://www.nutch.org/
+-http://www.nutch.org/abcd/foo/bar/foo/bar/foo/
+-http://www.nutch.org/abcd/foo/bar/xyz/foo/bar/foo/

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-regex/sample/nutch1838.rules
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/sample/nutch1838.rules b/nutch-plugins/urlfilter-regex/sample/nutch1838.rules
new file mode 100644
index 0000000..f7b0d13
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/sample/nutch1838.rules
@@ -0,0 +1,12 @@
+# Skip all URLs containing "skip" on www.example.org
+> www.example.org
+-skip
+<
+
+# Allow all URLs containing "skip" on www.example.com
+> www.example.com
++skip
+<
+
+# Skip everything else
+-.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-regex/sample/nutch1838.urls
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/sample/nutch1838.urls b/nutch-plugins/urlfilter-regex/sample/nutch1838.urls
new file mode 100644
index 0000000..c6f29d1
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/sample/nutch1838.urls
@@ -0,0 +1,3 @@
+-http://www.example.org/skip-me-now
++http://www.example.com/noone-can-skip-me
+-http://www.example.nl/i-am-filtered
\ No newline at end of file